1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the AArch64TargetLowering class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "AArch64ExpandImm.h" 14 #include "AArch64ISelLowering.h" 15 #include "AArch64CallingConvention.h" 16 #include "AArch64MachineFunctionInfo.h" 17 #include "AArch64PerfectShuffle.h" 18 #include "AArch64RegisterInfo.h" 19 #include "AArch64Subtarget.h" 20 #include "MCTargetDesc/AArch64AddressingModes.h" 21 #include "Utils/AArch64BaseInfo.h" 22 #include "llvm/ADT/APFloat.h" 23 #include "llvm/ADT/APInt.h" 24 #include "llvm/ADT/ArrayRef.h" 25 #include "llvm/ADT/STLExtras.h" 26 #include "llvm/ADT/SmallVector.h" 27 #include "llvm/ADT/Statistic.h" 28 #include "llvm/ADT/StringRef.h" 29 #include "llvm/ADT/StringSwitch.h" 30 #include "llvm/ADT/Triple.h" 31 #include "llvm/ADT/Twine.h" 32 #include "llvm/Analysis/VectorUtils.h" 33 #include "llvm/CodeGen/CallingConvLower.h" 34 #include "llvm/CodeGen/MachineBasicBlock.h" 35 #include "llvm/CodeGen/MachineFrameInfo.h" 36 #include "llvm/CodeGen/MachineFunction.h" 37 #include "llvm/CodeGen/MachineInstr.h" 38 #include "llvm/CodeGen/MachineInstrBuilder.h" 39 #include "llvm/CodeGen/MachineMemOperand.h" 40 #include "llvm/CodeGen/MachineRegisterInfo.h" 41 #include "llvm/CodeGen/RuntimeLibcalls.h" 42 #include "llvm/CodeGen/SelectionDAG.h" 43 #include "llvm/CodeGen/SelectionDAGNodes.h" 44 #include "llvm/CodeGen/TargetCallingConv.h" 45 #include "llvm/CodeGen/TargetInstrInfo.h" 46 #include "llvm/CodeGen/ValueTypes.h" 47 #include "llvm/IR/Attributes.h" 48 #include "llvm/IR/Constants.h" 49 #include "llvm/IR/DataLayout.h" 50 #include "llvm/IR/DebugLoc.h" 51 #include "llvm/IR/DerivedTypes.h" 52 #include "llvm/IR/Function.h" 53 #include "llvm/IR/GetElementPtrTypeIterator.h" 54 #include "llvm/IR/GlobalValue.h" 55 #include "llvm/IR/IRBuilder.h" 56 #include "llvm/IR/Instruction.h" 57 #include "llvm/IR/Instructions.h" 58 #include "llvm/IR/IntrinsicInst.h" 59 #include "llvm/IR/Intrinsics.h" 60 #include "llvm/IR/Module.h" 61 #include "llvm/IR/OperandTraits.h" 62 #include "llvm/IR/PatternMatch.h" 63 #include "llvm/IR/Type.h" 64 #include "llvm/IR/Use.h" 65 #include "llvm/IR/Value.h" 66 #include "llvm/MC/MCRegisterInfo.h" 67 #include "llvm/Support/Casting.h" 68 #include "llvm/Support/CodeGen.h" 69 #include "llvm/Support/CommandLine.h" 70 #include "llvm/Support/Compiler.h" 71 #include "llvm/Support/Debug.h" 72 #include "llvm/Support/ErrorHandling.h" 73 #include "llvm/Support/KnownBits.h" 74 #include "llvm/Support/MachineValueType.h" 75 #include "llvm/Support/MathExtras.h" 76 #include "llvm/Support/raw_ostream.h" 77 #include "llvm/Target/TargetMachine.h" 78 #include "llvm/Target/TargetOptions.h" 79 #include <algorithm> 80 #include <bitset> 81 #include <cassert> 82 #include <cctype> 83 #include <cstdint> 84 #include <cstdlib> 85 #include <iterator> 86 #include <limits> 87 #include <tuple> 88 #include <utility> 89 #include <vector> 90 91 using namespace llvm; 92 using namespace llvm::PatternMatch; 93 94 #define DEBUG_TYPE "aarch64-lower" 95 96 STATISTIC(NumTailCalls, "Number of tail calls"); 97 STATISTIC(NumShiftInserts, "Number of vector shift 
inserts"); 98 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); 99 100 static cl::opt<bool> 101 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, 102 cl::desc("Allow AArch64 SLI/SRI formation"), 103 cl::init(false)); 104 105 // FIXME: The necessary dtprel relocations don't seem to be supported 106 // well in the GNU bfd and gold linkers at the moment. Therefore, by 107 // default, for now, fall back to GeneralDynamic code generation. 108 cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( 109 "aarch64-elf-ldtls-generation", cl::Hidden, 110 cl::desc("Allow AArch64 Local Dynamic TLS code generation"), 111 cl::init(false)); 112 113 static cl::opt<bool> 114 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, 115 cl::desc("Enable AArch64 logical imm instruction " 116 "optimization"), 117 cl::init(true)); 118 119 /// Value type used for condition codes. 120 static const MVT MVT_CC = MVT::i32; 121 122 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, 123 const AArch64Subtarget &STI) 124 : TargetLowering(TM), Subtarget(&STI) { 125 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so 126 // we have to make something up. Arbitrarily, choose ZeroOrOne. 127 setBooleanContents(ZeroOrOneBooleanContent); 128 // When comparing vectors the result sets the different elements in the 129 // vector to all-one or all-zero. 130 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 131 132 // Set up the register classes. 133 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass); 134 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass); 135 136 if (Subtarget->hasFPARMv8()) { 137 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); 138 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); 139 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); 140 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); 141 } 142 143 if (Subtarget->hasNEON()) { 144 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); 145 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); 146 // Someone set us up the NEON. 
147 addDRTypeForNEON(MVT::v2f32); 148 addDRTypeForNEON(MVT::v8i8); 149 addDRTypeForNEON(MVT::v4i16); 150 addDRTypeForNEON(MVT::v2i32); 151 addDRTypeForNEON(MVT::v1i64); 152 addDRTypeForNEON(MVT::v1f64); 153 addDRTypeForNEON(MVT::v4f16); 154 155 addQRTypeForNEON(MVT::v4f32); 156 addQRTypeForNEON(MVT::v2f64); 157 addQRTypeForNEON(MVT::v16i8); 158 addQRTypeForNEON(MVT::v8i16); 159 addQRTypeForNEON(MVT::v4i32); 160 addQRTypeForNEON(MVT::v2i64); 161 addQRTypeForNEON(MVT::v8f16); 162 } 163 164 // Compute derived properties from the register classes 165 computeRegisterProperties(Subtarget->getRegisterInfo()); 166 167 // Provide all sorts of operation actions 168 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 169 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 170 setOperationAction(ISD::SETCC, MVT::i32, Custom); 171 setOperationAction(ISD::SETCC, MVT::i64, Custom); 172 setOperationAction(ISD::SETCC, MVT::f16, Custom); 173 setOperationAction(ISD::SETCC, MVT::f32, Custom); 174 setOperationAction(ISD::SETCC, MVT::f64, Custom); 175 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 176 setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); 177 setOperationAction(ISD::BRCOND, MVT::Other, Expand); 178 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 179 setOperationAction(ISD::BR_CC, MVT::i64, Custom); 180 setOperationAction(ISD::BR_CC, MVT::f16, Custom); 181 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 182 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 183 setOperationAction(ISD::SELECT, MVT::i32, Custom); 184 setOperationAction(ISD::SELECT, MVT::i64, Custom); 185 setOperationAction(ISD::SELECT, MVT::f16, Custom); 186 setOperationAction(ISD::SELECT, MVT::f32, Custom); 187 setOperationAction(ISD::SELECT, MVT::f64, Custom); 188 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 189 setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); 190 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); 191 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 192 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 193 setOperationAction(ISD::BR_JT, MVT::Other, Custom); 194 setOperationAction(ISD::JumpTable, MVT::i64, Custom); 195 196 setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); 197 setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); 198 setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); 199 200 setOperationAction(ISD::FREM, MVT::f32, Expand); 201 setOperationAction(ISD::FREM, MVT::f64, Expand); 202 setOperationAction(ISD::FREM, MVT::f80, Expand); 203 204 setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); 205 206 // Custom lowering hooks are needed for XOR 207 // to fold it into CSINC/CSINV. 208 setOperationAction(ISD::XOR, MVT::i32, Custom); 209 setOperationAction(ISD::XOR, MVT::i64, Custom); 210 211 // Virtually no operation on f128 is legal, but LLVM can't expand them when 212 // there's a valid register class, so we need custom operations in most cases. 
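  // Most of the Custom lowerings below just turn the f128 operation into a
  // call to the corresponding soft-float libcall (for example, FADD on f128
  // generally becomes a call to __addtf3).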
213 setOperationAction(ISD::FABS, MVT::f128, Expand); 214 setOperationAction(ISD::FADD, MVT::f128, Custom); 215 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); 216 setOperationAction(ISD::FCOS, MVT::f128, Expand); 217 setOperationAction(ISD::FDIV, MVT::f128, Custom); 218 setOperationAction(ISD::FMA, MVT::f128, Expand); 219 setOperationAction(ISD::FMUL, MVT::f128, Custom); 220 setOperationAction(ISD::FNEG, MVT::f128, Expand); 221 setOperationAction(ISD::FPOW, MVT::f128, Expand); 222 setOperationAction(ISD::FREM, MVT::f128, Expand); 223 setOperationAction(ISD::FRINT, MVT::f128, Expand); 224 setOperationAction(ISD::FSIN, MVT::f128, Expand); 225 setOperationAction(ISD::FSINCOS, MVT::f128, Expand); 226 setOperationAction(ISD::FSQRT, MVT::f128, Expand); 227 setOperationAction(ISD::FSUB, MVT::f128, Custom); 228 setOperationAction(ISD::FTRUNC, MVT::f128, Expand); 229 setOperationAction(ISD::SETCC, MVT::f128, Custom); 230 setOperationAction(ISD::BR_CC, MVT::f128, Custom); 231 setOperationAction(ISD::SELECT, MVT::f128, Custom); 232 setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); 233 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); 234 235 // Lowering for many of the conversions is actually specified by the non-f128 236 // type. The LowerXXX function will be trivial when f128 isn't involved. 237 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 238 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); 239 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); 240 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 241 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); 242 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); 243 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 244 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); 245 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); 246 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 247 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); 248 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); 249 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); 250 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); 251 252 // Variable arguments. 253 setOperationAction(ISD::VASTART, MVT::Other, Custom); 254 setOperationAction(ISD::VAARG, MVT::Other, Custom); 255 setOperationAction(ISD::VACOPY, MVT::Other, Custom); 256 setOperationAction(ISD::VAEND, MVT::Other, Expand); 257 258 // Variable-sized objects. 259 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 260 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 261 262 if (Subtarget->isTargetWindows()) 263 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); 264 else 265 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 266 267 // Constant pool entries 268 setOperationAction(ISD::ConstantPool, MVT::i64, Custom); 269 270 // BlockAddress 271 setOperationAction(ISD::BlockAddress, MVT::i64, Custom); 272 273 // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. 274 setOperationAction(ISD::ADDC, MVT::i32, Custom); 275 setOperationAction(ISD::ADDE, MVT::i32, Custom); 276 setOperationAction(ISD::SUBC, MVT::i32, Custom); 277 setOperationAction(ISD::SUBE, MVT::i32, Custom); 278 setOperationAction(ISD::ADDC, MVT::i64, Custom); 279 setOperationAction(ISD::ADDE, MVT::i64, Custom); 280 setOperationAction(ISD::SUBC, MVT::i64, Custom); 281 setOperationAction(ISD::SUBE, MVT::i64, Custom); 282 283 // AArch64 lacks both left-rotate and popcount instructions. 
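  // ROTL is therefore expanded by the generic legalizer (in terms of ROTR and
  // shifts), and scalar CTPOP is custom-lowered below using the NEON CNT
  // instruction.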
284 setOperationAction(ISD::ROTL, MVT::i32, Expand); 285 setOperationAction(ISD::ROTL, MVT::i64, Expand); 286 for (MVT VT : MVT::vector_valuetypes()) { 287 setOperationAction(ISD::ROTL, VT, Expand); 288 setOperationAction(ISD::ROTR, VT, Expand); 289 } 290 291 // AArch64 doesn't have {U|S}MUL_LOHI. 292 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); 293 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); 294 295 setOperationAction(ISD::CTPOP, MVT::i32, Custom); 296 setOperationAction(ISD::CTPOP, MVT::i64, Custom); 297 298 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 299 setOperationAction(ISD::SDIVREM, MVT::i64, Expand); 300 for (MVT VT : MVT::vector_valuetypes()) { 301 setOperationAction(ISD::SDIVREM, VT, Expand); 302 setOperationAction(ISD::UDIVREM, VT, Expand); 303 } 304 setOperationAction(ISD::SREM, MVT::i32, Expand); 305 setOperationAction(ISD::SREM, MVT::i64, Expand); 306 setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 307 setOperationAction(ISD::UDIVREM, MVT::i64, Expand); 308 setOperationAction(ISD::UREM, MVT::i32, Expand); 309 setOperationAction(ISD::UREM, MVT::i64, Expand); 310 311 // Custom lower Add/Sub/Mul with overflow. 312 setOperationAction(ISD::SADDO, MVT::i32, Custom); 313 setOperationAction(ISD::SADDO, MVT::i64, Custom); 314 setOperationAction(ISD::UADDO, MVT::i32, Custom); 315 setOperationAction(ISD::UADDO, MVT::i64, Custom); 316 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 317 setOperationAction(ISD::SSUBO, MVT::i64, Custom); 318 setOperationAction(ISD::USUBO, MVT::i32, Custom); 319 setOperationAction(ISD::USUBO, MVT::i64, Custom); 320 setOperationAction(ISD::SMULO, MVT::i32, Custom); 321 setOperationAction(ISD::SMULO, MVT::i64, Custom); 322 setOperationAction(ISD::UMULO, MVT::i32, Custom); 323 setOperationAction(ISD::UMULO, MVT::i64, Custom); 324 325 setOperationAction(ISD::FSIN, MVT::f32, Expand); 326 setOperationAction(ISD::FSIN, MVT::f64, Expand); 327 setOperationAction(ISD::FCOS, MVT::f32, Expand); 328 setOperationAction(ISD::FCOS, MVT::f64, Expand); 329 setOperationAction(ISD::FPOW, MVT::f32, Expand); 330 setOperationAction(ISD::FPOW, MVT::f64, Expand); 331 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 332 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 333 if (Subtarget->hasFullFP16()) 334 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom); 335 else 336 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); 337 338 setOperationAction(ISD::FREM, MVT::f16, Promote); 339 setOperationAction(ISD::FREM, MVT::v4f16, Expand); 340 setOperationAction(ISD::FREM, MVT::v8f16, Expand); 341 setOperationAction(ISD::FPOW, MVT::f16, Promote); 342 setOperationAction(ISD::FPOW, MVT::v4f16, Expand); 343 setOperationAction(ISD::FPOW, MVT::v8f16, Expand); 344 setOperationAction(ISD::FPOWI, MVT::f16, Promote); 345 setOperationAction(ISD::FPOWI, MVT::v4f16, Expand); 346 setOperationAction(ISD::FPOWI, MVT::v8f16, Expand); 347 setOperationAction(ISD::FCOS, MVT::f16, Promote); 348 setOperationAction(ISD::FCOS, MVT::v4f16, Expand); 349 setOperationAction(ISD::FCOS, MVT::v8f16, Expand); 350 setOperationAction(ISD::FSIN, MVT::f16, Promote); 351 setOperationAction(ISD::FSIN, MVT::v4f16, Expand); 352 setOperationAction(ISD::FSIN, MVT::v8f16, Expand); 353 setOperationAction(ISD::FSINCOS, MVT::f16, Promote); 354 setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand); 355 setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand); 356 setOperationAction(ISD::FEXP, MVT::f16, Promote); 357 setOperationAction(ISD::FEXP, MVT::v4f16, Expand); 358 
setOperationAction(ISD::FEXP, MVT::v8f16, Expand); 359 setOperationAction(ISD::FEXP2, MVT::f16, Promote); 360 setOperationAction(ISD::FEXP2, MVT::v4f16, Expand); 361 setOperationAction(ISD::FEXP2, MVT::v8f16, Expand); 362 setOperationAction(ISD::FLOG, MVT::f16, Promote); 363 setOperationAction(ISD::FLOG, MVT::v4f16, Expand); 364 setOperationAction(ISD::FLOG, MVT::v8f16, Expand); 365 setOperationAction(ISD::FLOG2, MVT::f16, Promote); 366 setOperationAction(ISD::FLOG2, MVT::v4f16, Expand); 367 setOperationAction(ISD::FLOG2, MVT::v8f16, Expand); 368 setOperationAction(ISD::FLOG10, MVT::f16, Promote); 369 setOperationAction(ISD::FLOG10, MVT::v4f16, Expand); 370 setOperationAction(ISD::FLOG10, MVT::v8f16, Expand); 371 372 if (!Subtarget->hasFullFP16()) { 373 setOperationAction(ISD::SELECT, MVT::f16, Promote); 374 setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); 375 setOperationAction(ISD::SETCC, MVT::f16, Promote); 376 setOperationAction(ISD::BR_CC, MVT::f16, Promote); 377 setOperationAction(ISD::FADD, MVT::f16, Promote); 378 setOperationAction(ISD::FSUB, MVT::f16, Promote); 379 setOperationAction(ISD::FMUL, MVT::f16, Promote); 380 setOperationAction(ISD::FDIV, MVT::f16, Promote); 381 setOperationAction(ISD::FMA, MVT::f16, Promote); 382 setOperationAction(ISD::FNEG, MVT::f16, Promote); 383 setOperationAction(ISD::FABS, MVT::f16, Promote); 384 setOperationAction(ISD::FCEIL, MVT::f16, Promote); 385 setOperationAction(ISD::FSQRT, MVT::f16, Promote); 386 setOperationAction(ISD::FFLOOR, MVT::f16, Promote); 387 setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); 388 setOperationAction(ISD::FRINT, MVT::f16, Promote); 389 setOperationAction(ISD::FROUND, MVT::f16, Promote); 390 setOperationAction(ISD::FTRUNC, MVT::f16, Promote); 391 setOperationAction(ISD::FMINNUM, MVT::f16, Promote); 392 setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); 393 setOperationAction(ISD::FMINIMUM, MVT::f16, Promote); 394 setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote); 395 396 // promote v4f16 to v4f32 when that is known to be safe. 
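    // (f32 carries enough precision that doing an f16 add/sub/mul/div in f32
    // and rounding the result back to f16 yields the same value as a native
    // f16 operation, so this promotion should not change results.)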
397 setOperationAction(ISD::FADD, MVT::v4f16, Promote); 398 setOperationAction(ISD::FSUB, MVT::v4f16, Promote); 399 setOperationAction(ISD::FMUL, MVT::v4f16, Promote); 400 setOperationAction(ISD::FDIV, MVT::v4f16, Promote); 401 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote); 402 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote); 403 AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); 404 AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32); 405 AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); 406 AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); 407 AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32); 408 AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32); 409 410 setOperationAction(ISD::FABS, MVT::v4f16, Expand); 411 setOperationAction(ISD::FNEG, MVT::v4f16, Expand); 412 setOperationAction(ISD::FROUND, MVT::v4f16, Expand); 413 setOperationAction(ISD::FMA, MVT::v4f16, Expand); 414 setOperationAction(ISD::SETCC, MVT::v4f16, Expand); 415 setOperationAction(ISD::BR_CC, MVT::v4f16, Expand); 416 setOperationAction(ISD::SELECT, MVT::v4f16, Expand); 417 setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand); 418 setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand); 419 setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand); 420 setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand); 421 setOperationAction(ISD::FCEIL, MVT::v4f16, Expand); 422 setOperationAction(ISD::FRINT, MVT::v4f16, Expand); 423 setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); 424 setOperationAction(ISD::FSQRT, MVT::v4f16, Expand); 425 426 setOperationAction(ISD::FABS, MVT::v8f16, Expand); 427 setOperationAction(ISD::FADD, MVT::v8f16, Expand); 428 setOperationAction(ISD::FCEIL, MVT::v8f16, Expand); 429 setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand); 430 setOperationAction(ISD::FDIV, MVT::v8f16, Expand); 431 setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand); 432 setOperationAction(ISD::FMA, MVT::v8f16, Expand); 433 setOperationAction(ISD::FMUL, MVT::v8f16, Expand); 434 setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); 435 setOperationAction(ISD::FNEG, MVT::v8f16, Expand); 436 setOperationAction(ISD::FROUND, MVT::v8f16, Expand); 437 setOperationAction(ISD::FRINT, MVT::v8f16, Expand); 438 setOperationAction(ISD::FSQRT, MVT::v8f16, Expand); 439 setOperationAction(ISD::FSUB, MVT::v8f16, Expand); 440 setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand); 441 setOperationAction(ISD::SETCC, MVT::v8f16, Expand); 442 setOperationAction(ISD::BR_CC, MVT::v8f16, Expand); 443 setOperationAction(ISD::SELECT, MVT::v8f16, Expand); 444 setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand); 445 setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand); 446 } 447 448 // AArch64 has implementations of a lot of rounding-like FP operations. 
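  // The rounding operations map directly onto the FRINT* family:
  // FFLOOR -> FRINTM, FCEIL -> FRINTP, FTRUNC -> FRINTZ, FROUND -> FRINTA,
  // FRINT -> FRINTX, FNEARBYINT -> FRINTI.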
449 for (MVT Ty : {MVT::f32, MVT::f64}) { 450 setOperationAction(ISD::FFLOOR, Ty, Legal); 451 setOperationAction(ISD::FNEARBYINT, Ty, Legal); 452 setOperationAction(ISD::FCEIL, Ty, Legal); 453 setOperationAction(ISD::FRINT, Ty, Legal); 454 setOperationAction(ISD::FTRUNC, Ty, Legal); 455 setOperationAction(ISD::FROUND, Ty, Legal); 456 setOperationAction(ISD::FMINNUM, Ty, Legal); 457 setOperationAction(ISD::FMAXNUM, Ty, Legal); 458 setOperationAction(ISD::FMINIMUM, Ty, Legal); 459 setOperationAction(ISD::FMAXIMUM, Ty, Legal); 460 setOperationAction(ISD::LROUND, Ty, Legal); 461 setOperationAction(ISD::LLROUND, Ty, Legal); 462 setOperationAction(ISD::LRINT, Ty, Legal); 463 setOperationAction(ISD::LLRINT, Ty, Legal); 464 } 465 466 if (Subtarget->hasFullFP16()) { 467 setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal); 468 setOperationAction(ISD::FFLOOR, MVT::f16, Legal); 469 setOperationAction(ISD::FCEIL, MVT::f16, Legal); 470 setOperationAction(ISD::FRINT, MVT::f16, Legal); 471 setOperationAction(ISD::FTRUNC, MVT::f16, Legal); 472 setOperationAction(ISD::FROUND, MVT::f16, Legal); 473 setOperationAction(ISD::FMINNUM, MVT::f16, Legal); 474 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); 475 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); 476 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); 477 } 478 479 setOperationAction(ISD::PREFETCH, MVT::Other, Custom); 480 481 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); 482 483 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); 484 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom); 485 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 486 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom); 487 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 488 489 // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. 490 // This requires the Performance Monitors extension. 491 if (Subtarget->hasPerfMon()) 492 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); 493 494 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && 495 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { 496 // Issue __sincos_stret if available. 497 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 498 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 499 } else { 500 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 501 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 502 } 503 504 // Make floating-point constants legal for the large code model, so they don't 505 // become loads from the constant pool. 506 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { 507 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 508 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 509 } 510 511 // AArch64 does not have floating-point extending loads, i1 sign-extending 512 // load, floating-point truncating stores, or v2i32->v2i16 truncating store. 
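  // An extending FP load is therefore split into a plain load of the narrow
  // type followed by an FP_EXTEND (FCVT), and a truncating FP store into an
  // FP_ROUND (FCVT) followed by a plain store.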
513 for (MVT VT : MVT::fp_valuetypes()) { 514 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); 515 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 516 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand); 517 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); 518 } 519 for (MVT VT : MVT::integer_valuetypes()) 520 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand); 521 522 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 523 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 524 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 525 setTruncStoreAction(MVT::f128, MVT::f80, Expand); 526 setTruncStoreAction(MVT::f128, MVT::f64, Expand); 527 setTruncStoreAction(MVT::f128, MVT::f32, Expand); 528 setTruncStoreAction(MVT::f128, MVT::f16, Expand); 529 530 setOperationAction(ISD::BITCAST, MVT::i16, Custom); 531 setOperationAction(ISD::BITCAST, MVT::f16, Custom); 532 533 // Indexed loads and stores are supported. 534 for (unsigned im = (unsigned)ISD::PRE_INC; 535 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 536 setIndexedLoadAction(im, MVT::i8, Legal); 537 setIndexedLoadAction(im, MVT::i16, Legal); 538 setIndexedLoadAction(im, MVT::i32, Legal); 539 setIndexedLoadAction(im, MVT::i64, Legal); 540 setIndexedLoadAction(im, MVT::f64, Legal); 541 setIndexedLoadAction(im, MVT::f32, Legal); 542 setIndexedLoadAction(im, MVT::f16, Legal); 543 setIndexedStoreAction(im, MVT::i8, Legal); 544 setIndexedStoreAction(im, MVT::i16, Legal); 545 setIndexedStoreAction(im, MVT::i32, Legal); 546 setIndexedStoreAction(im, MVT::i64, Legal); 547 setIndexedStoreAction(im, MVT::f64, Legal); 548 setIndexedStoreAction(im, MVT::f32, Legal); 549 setIndexedStoreAction(im, MVT::f16, Legal); 550 } 551 552 // Trap. 553 setOperationAction(ISD::TRAP, MVT::Other, Legal); 554 if (Subtarget->isTargetWindows()) 555 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); 556 557 // We combine OR nodes for bitfield operations. 558 setTargetDAGCombine(ISD::OR); 559 // Try to create BICs for vector ANDs. 560 setTargetDAGCombine(ISD::AND); 561 562 // Vector add and sub nodes may conceal a high-half opportunity. 563 // Also, try to fold ADD into CSINC/CSINV.. 564 setTargetDAGCombine(ISD::ADD); 565 setTargetDAGCombine(ISD::SUB); 566 setTargetDAGCombine(ISD::SRL); 567 setTargetDAGCombine(ISD::XOR); 568 setTargetDAGCombine(ISD::SINT_TO_FP); 569 setTargetDAGCombine(ISD::UINT_TO_FP); 570 571 setTargetDAGCombine(ISD::FP_TO_SINT); 572 setTargetDAGCombine(ISD::FP_TO_UINT); 573 setTargetDAGCombine(ISD::FDIV); 574 575 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 576 577 setTargetDAGCombine(ISD::ANY_EXTEND); 578 setTargetDAGCombine(ISD::ZERO_EXTEND); 579 setTargetDAGCombine(ISD::SIGN_EXTEND); 580 setTargetDAGCombine(ISD::BITCAST); 581 setTargetDAGCombine(ISD::CONCAT_VECTORS); 582 setTargetDAGCombine(ISD::STORE); 583 if (Subtarget->supportsAddressTopByteIgnored()) 584 setTargetDAGCombine(ISD::LOAD); 585 586 setTargetDAGCombine(ISD::MUL); 587 588 setTargetDAGCombine(ISD::SELECT); 589 setTargetDAGCombine(ISD::VSELECT); 590 591 setTargetDAGCombine(ISD::INTRINSIC_VOID); 592 setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); 593 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); 594 595 setTargetDAGCombine(ISD::GlobalAddress); 596 597 // In case of strict alignment, avoid an excessive number of byte wide stores. 598 MaxStoresPerMemsetOptSize = 8; 599 MaxStoresPerMemset = Subtarget->requiresStrictAlign() 600 ? 
MaxStoresPerMemsetOptSize : 32; 601 602 MaxGluedStoresPerMemcpy = 4; 603 MaxStoresPerMemcpyOptSize = 4; 604 MaxStoresPerMemcpy = Subtarget->requiresStrictAlign() 605 ? MaxStoresPerMemcpyOptSize : 16; 606 607 MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4; 608 609 MaxLoadsPerMemcmpOptSize = 4; 610 MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign() 611 ? MaxLoadsPerMemcmpOptSize : 8; 612 613 setStackPointerRegisterToSaveRestore(AArch64::SP); 614 615 setSchedulingPreference(Sched::Hybrid); 616 617 EnableExtLdPromotion = true; 618 619 // Set required alignment. 620 setMinFunctionAlignment(2); 621 // Set preferred alignments. 622 setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); 623 setPrefLoopAlignment(STI.getPrefLoopAlignment()); 624 625 // Only change the limit for entries in a jump table if specified by 626 // the sub target, but not at the command line. 627 unsigned MaxJT = STI.getMaximumJumpTableSize(); 628 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX) 629 setMaximumJumpTableSize(MaxJT); 630 631 setHasExtractBitsInsn(true); 632 633 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 634 635 if (Subtarget->hasNEON()) { 636 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to 637 // silliness like this: 638 setOperationAction(ISD::FABS, MVT::v1f64, Expand); 639 setOperationAction(ISD::FADD, MVT::v1f64, Expand); 640 setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); 641 setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); 642 setOperationAction(ISD::FCOS, MVT::v1f64, Expand); 643 setOperationAction(ISD::FDIV, MVT::v1f64, Expand); 644 setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); 645 setOperationAction(ISD::FMA, MVT::v1f64, Expand); 646 setOperationAction(ISD::FMUL, MVT::v1f64, Expand); 647 setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); 648 setOperationAction(ISD::FNEG, MVT::v1f64, Expand); 649 setOperationAction(ISD::FPOW, MVT::v1f64, Expand); 650 setOperationAction(ISD::FREM, MVT::v1f64, Expand); 651 setOperationAction(ISD::FROUND, MVT::v1f64, Expand); 652 setOperationAction(ISD::FRINT, MVT::v1f64, Expand); 653 setOperationAction(ISD::FSIN, MVT::v1f64, Expand); 654 setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); 655 setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); 656 setOperationAction(ISD::FSUB, MVT::v1f64, Expand); 657 setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); 658 setOperationAction(ISD::SETCC, MVT::v1f64, Expand); 659 setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); 660 setOperationAction(ISD::SELECT, MVT::v1f64, Expand); 661 setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); 662 setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); 663 664 setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); 665 setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); 666 setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); 667 setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); 668 setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); 669 670 setOperationAction(ISD::MUL, MVT::v1i64, Expand); 671 672 // AArch64 doesn't have a direct vector ->f32 conversion instructions for 673 // elements smaller than i32, so promote the input to i32 first. 
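    // For example, a v4i8 -> v4f32 conversion becomes
    // v4i8 -> v4i32 (extend) -> v4f32 (scvtf/ucvtf).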
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
    // i8 vector elements also need promotion to i32 for v8i8.
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
    // Similarly, there is no direct i32 -> f64 vector conversion instruction.
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
    // Nor is there a direct i32 -> f16 vector conversion. Mark it Custom so
    // the conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);

    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    } else {
      // When AArch64 doesn't have full FP16 support, promote the input
      // to i32 first.
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
    }

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

    // AArch64 doesn't have MUL.2d:
    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
    // Custom handling for some quad-vector types to detect MULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);

    // Vector reductions
    for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                    MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
    }
    for (MVT VT : { MVT::v4f16, MVT::v2f32,
                    MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
    }

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
    // Likewise, narrowing and extending vector loads/stores aren't handled
    // directly.
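    // Every vector-to-vector truncating store and extending load is marked
    // Expand below, so the legalizer rewrites them as a full-width load or
    // store plus explicit extend/truncate nodes.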
732 for (MVT VT : MVT::vector_valuetypes()) { 733 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); 734 735 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) { 736 setOperationAction(ISD::MULHS, VT, Legal); 737 setOperationAction(ISD::MULHU, VT, Legal); 738 } else { 739 setOperationAction(ISD::MULHS, VT, Expand); 740 setOperationAction(ISD::MULHU, VT, Expand); 741 } 742 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 743 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 744 745 setOperationAction(ISD::BSWAP, VT, Expand); 746 setOperationAction(ISD::CTTZ, VT, Expand); 747 748 for (MVT InnerVT : MVT::vector_valuetypes()) { 749 setTruncStoreAction(VT, InnerVT, Expand); 750 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); 751 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); 752 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); 753 } 754 } 755 756 // AArch64 has implementations of a lot of rounding-like FP operations. 757 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) { 758 setOperationAction(ISD::FFLOOR, Ty, Legal); 759 setOperationAction(ISD::FNEARBYINT, Ty, Legal); 760 setOperationAction(ISD::FCEIL, Ty, Legal); 761 setOperationAction(ISD::FRINT, Ty, Legal); 762 setOperationAction(ISD::FTRUNC, Ty, Legal); 763 setOperationAction(ISD::FROUND, Ty, Legal); 764 } 765 766 if (Subtarget->hasFullFP16()) { 767 for (MVT Ty : {MVT::v4f16, MVT::v8f16}) { 768 setOperationAction(ISD::FFLOOR, Ty, Legal); 769 setOperationAction(ISD::FNEARBYINT, Ty, Legal); 770 setOperationAction(ISD::FCEIL, Ty, Legal); 771 setOperationAction(ISD::FRINT, Ty, Legal); 772 setOperationAction(ISD::FTRUNC, Ty, Legal); 773 setOperationAction(ISD::FROUND, Ty, Legal); 774 } 775 } 776 777 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); 778 } 779 780 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); 781 } 782 783 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { 784 assert(VT.isVector() && "VT should be a vector type"); 785 786 if (VT.isFloatingPoint()) { 787 MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT(); 788 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo); 789 setOperationPromotedToType(ISD::STORE, VT, PromoteTo); 790 } 791 792 // Mark vector float intrinsics as expand. 793 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { 794 setOperationAction(ISD::FSIN, VT, Expand); 795 setOperationAction(ISD::FCOS, VT, Expand); 796 setOperationAction(ISD::FPOW, VT, Expand); 797 setOperationAction(ISD::FLOG, VT, Expand); 798 setOperationAction(ISD::FLOG2, VT, Expand); 799 setOperationAction(ISD::FLOG10, VT, Expand); 800 setOperationAction(ISD::FEXP, VT, Expand); 801 setOperationAction(ISD::FEXP2, VT, Expand); 802 803 // But we do support custom-lowering for FCOPYSIGN. 
804 setOperationAction(ISD::FCOPYSIGN, VT, Custom); 805 } 806 807 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 808 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 809 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 810 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 811 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 812 setOperationAction(ISD::SRA, VT, Custom); 813 setOperationAction(ISD::SRL, VT, Custom); 814 setOperationAction(ISD::SHL, VT, Custom); 815 setOperationAction(ISD::OR, VT, Custom); 816 setOperationAction(ISD::SETCC, VT, Custom); 817 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); 818 819 setOperationAction(ISD::SELECT, VT, Expand); 820 setOperationAction(ISD::SELECT_CC, VT, Expand); 821 setOperationAction(ISD::VSELECT, VT, Expand); 822 for (MVT InnerVT : MVT::all_valuetypes()) 823 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); 824 825 // CNT supports only B element sizes, then use UADDLP to widen. 826 if (VT != MVT::v8i8 && VT != MVT::v16i8) 827 setOperationAction(ISD::CTPOP, VT, Custom); 828 829 setOperationAction(ISD::UDIV, VT, Expand); 830 setOperationAction(ISD::SDIV, VT, Expand); 831 setOperationAction(ISD::UREM, VT, Expand); 832 setOperationAction(ISD::SREM, VT, Expand); 833 setOperationAction(ISD::FREM, VT, Expand); 834 835 setOperationAction(ISD::FP_TO_SINT, VT, Custom); 836 setOperationAction(ISD::FP_TO_UINT, VT, Custom); 837 838 if (!VT.isFloatingPoint()) 839 setOperationAction(ISD::ABS, VT, Legal); 840 841 // [SU][MIN|MAX] are available for all NEON types apart from i64. 842 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) 843 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) 844 setOperationAction(Opcode, VT, Legal); 845 846 // F[MIN|MAX][NUM|NAN] are available for all FP NEON types. 847 if (VT.isFloatingPoint() && 848 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())) 849 for (unsigned Opcode : 850 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM}) 851 setOperationAction(Opcode, VT, Legal); 852 853 if (Subtarget->isLittleEndian()) { 854 for (unsigned im = (unsigned)ISD::PRE_INC; 855 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 856 setIndexedLoadAction(im, VT, Legal); 857 setIndexedStoreAction(im, VT, Legal); 858 } 859 } 860 } 861 862 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { 863 addRegisterClass(VT, &AArch64::FPR64RegClass); 864 addTypeForNEON(VT, MVT::v2i32); 865 } 866 867 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { 868 addRegisterClass(VT, &AArch64::FPR128RegClass); 869 addTypeForNEON(VT, MVT::v4i32); 870 } 871 872 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, 873 EVT VT) const { 874 if (!VT.isVector()) 875 return MVT::i32; 876 return VT.changeVectorElementTypeToInteger(); 877 } 878 879 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, 880 const APInt &Demanded, 881 TargetLowering::TargetLoweringOpt &TLO, 882 unsigned NewOpc) { 883 uint64_t OldImm = Imm, NewImm, Enc; 884 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask; 885 886 // Return if the immediate is already all zeros, all ones, a bimm32 or a 887 // bimm64. 888 if (Imm == 0 || Imm == Mask || 889 AArch64_AM::isLogicalImmediate(Imm & Mask, Size)) 890 return false; 891 892 unsigned EltSize = Size; 893 uint64_t DemandedBits = Demanded.getZExtValue(); 894 895 // Clear bits that are not demanded. 
896 Imm &= DemandedBits; 897 898 while (true) { 899 // The goal here is to set the non-demanded bits in a way that minimizes 900 // the number of switching between 0 and 1. In order to achieve this goal, 901 // we set the non-demanded bits to the value of the preceding demanded bits. 902 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a 903 // non-demanded bit), we copy bit0 (1) to the least significant 'x', 904 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'. 905 // The final result is 0b11000011. 906 uint64_t NonDemandedBits = ~DemandedBits; 907 uint64_t InvertedImm = ~Imm & DemandedBits; 908 uint64_t RotatedImm = 909 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) & 910 NonDemandedBits; 911 uint64_t Sum = RotatedImm + NonDemandedBits; 912 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1)); 913 uint64_t Ones = (Sum + Carry) & NonDemandedBits; 914 NewImm = (Imm | Ones) & Mask; 915 916 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate 917 // or all-ones or all-zeros, in which case we can stop searching. Otherwise, 918 // we halve the element size and continue the search. 919 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask))) 920 break; 921 922 // We cannot shrink the element size any further if it is 2-bits. 923 if (EltSize == 2) 924 return false; 925 926 EltSize /= 2; 927 Mask >>= EltSize; 928 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize; 929 930 // Return if there is mismatch in any of the demanded bits of Imm and Hi. 931 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0) 932 return false; 933 934 // Merge the upper and lower halves of Imm and DemandedBits. 935 Imm |= Hi; 936 DemandedBits |= DemandedBitsHi; 937 } 938 939 ++NumOptimizedImms; 940 941 // Replicate the element across the register width. 942 while (EltSize < Size) { 943 NewImm |= NewImm << EltSize; 944 EltSize *= 2; 945 } 946 947 (void)OldImm; 948 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 && 949 "demanded bits should never be altered"); 950 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm"); 951 952 // Create the new constant immediate node. 953 EVT VT = Op.getValueType(); 954 SDLoc DL(Op); 955 SDValue New; 956 957 // If the new constant immediate is all-zeros or all-ones, let the target 958 // independent DAG combine optimize this node. 959 if (NewImm == 0 || NewImm == OrigMask) { 960 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), 961 TLO.DAG.getConstant(NewImm, DL, VT)); 962 // Otherwise, create a machine node so that target independent DAG combine 963 // doesn't undo this optimization. 964 } else { 965 Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); 966 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); 967 New = SDValue( 968 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); 969 } 970 971 return TLO.CombineTo(Op, New); 972 } 973 974 bool AArch64TargetLowering::targetShrinkDemandedConstant( 975 SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const { 976 // Delay this optimization to as late as possible. 977 if (!TLO.LegalOps) 978 return false; 979 980 if (!EnableOptimizeLogicalImm) 981 return false; 982 983 EVT VT = Op.getValueType(); 984 if (VT.isVector()) 985 return false; 986 987 unsigned Size = VT.getSizeInBits(); 988 assert((Size == 32 || Size == 64) && 989 "i32 or i64 is expected after legalization."); 990 991 // Exit early if we demand all bits. 
  if (Demanded.countPopulation() == Size)
    return false;

  unsigned NewOpc;
  switch (Op.getOpcode()) {
  default:
    return false;
  case ISD::AND:
    NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
    break;
  case ISD::OR:
    NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
    break;
  case ISD::XOR:
    NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
    break;
  }
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!C)
    return false;
  uint64_t Imm = C->getZExtValue();
  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
}

/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them in Known.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, KnownBits &Known,
    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
  switch (Op.getOpcode()) {
  default:
    break;
  case AArch64ISD::CSEL: {
    KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known.Zero &= Known2.Zero;
    Known.One &= Known2.One;
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::aarch64_ldaxr:
    case Intrinsic::aarch64_ldxr: {
      unsigned BitWidth = Known.getBitWidth();
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN:
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    switch (IntNo) {
    default:
      break;
    case Intrinsic::aarch64_neon_umaxv:
    case Intrinsic::aarch64_neon_uminv: {
      // Figure out the datatype of the vector operand. The UMINV instruction
      // will zero extend the result, so we can mark as known zero all the
      // bits larger than the element datatype. 32-bit or larger doesn't need
      // this as those are legal types and will be handled by isel directly.
      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
      unsigned BitWidth = Known.getBitWidth();
      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
        assert(BitWidth >= 8 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
        Known.Zero |= Mask;
      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
        assert(BitWidth >= 16 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
        Known.Zero |= Mask;
      }
      break;
    }
    }
    break;
  }
  }
}

MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
                                                  EVT) const {
  return MVT::i64;
}

bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
    bool *Fast) const {
  if (Subtarget->requiresStrictAlign())
    return false;

  if (Fast) {
    // Some CPUs are fine with unaligned stores except for 128-bit ones.
1091 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || 1092 // See comments in performSTORECombine() for more details about 1093 // these conditions. 1094 1095 // Code that uses clang vector extensions can mark that it 1096 // wants unaligned accesses to be treated as fast by 1097 // underspecifying alignment to be 1 or 2. 1098 Align <= 2 || 1099 1100 // Disregard v2i64. Memcpy lowering produces those and splitting 1101 // them regresses performance on micro-benchmarks and olden/bh. 1102 VT == MVT::v2i64; 1103 } 1104 return true; 1105 } 1106 1107 FastISel * 1108 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 1109 const TargetLibraryInfo *libInfo) const { 1110 return AArch64::createFastISel(funcInfo, libInfo); 1111 } 1112 1113 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { 1114 switch ((AArch64ISD::NodeType)Opcode) { 1115 case AArch64ISD::FIRST_NUMBER: break; 1116 case AArch64ISD::CALL: return "AArch64ISD::CALL"; 1117 case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; 1118 case AArch64ISD::ADR: return "AArch64ISD::ADR"; 1119 case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; 1120 case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot"; 1121 case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG"; 1122 case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND"; 1123 case AArch64ISD::CSEL: return "AArch64ISD::CSEL"; 1124 case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL"; 1125 case AArch64ISD::CSINV: return "AArch64ISD::CSINV"; 1126 case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG"; 1127 case AArch64ISD::CSINC: return "AArch64ISD::CSINC"; 1128 case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; 1129 case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ"; 1130 case AArch64ISD::ADC: return "AArch64ISD::ADC"; 1131 case AArch64ISD::SBC: return "AArch64ISD::SBC"; 1132 case AArch64ISD::ADDS: return "AArch64ISD::ADDS"; 1133 case AArch64ISD::SUBS: return "AArch64ISD::SUBS"; 1134 case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; 1135 case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; 1136 case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; 1137 case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; 1138 case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; 1139 case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; 1140 case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; 1141 case AArch64ISD::DUP: return "AArch64ISD::DUP"; 1142 case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; 1143 case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; 1144 case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32"; 1145 case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64"; 1146 case AArch64ISD::MOVI: return "AArch64ISD::MOVI"; 1147 case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift"; 1148 case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit"; 1149 case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl"; 1150 case AArch64ISD::FMOV: return "AArch64ISD::FMOV"; 1151 case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift"; 1152 case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; 1153 case AArch64ISD::BICi: return "AArch64ISD::BICi"; 1154 case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; 1155 case AArch64ISD::BSL: return "AArch64ISD::BSL"; 1156 case AArch64ISD::NEG: return "AArch64ISD::NEG"; 1157 case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; 1158 case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; 1159 case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2"; 1160 case AArch64ISD::UZP1: return 
"AArch64ISD::UZP1"; 1161 case AArch64ISD::UZP2: return "AArch64ISD::UZP2"; 1162 case AArch64ISD::TRN1: return "AArch64ISD::TRN1"; 1163 case AArch64ISD::TRN2: return "AArch64ISD::TRN2"; 1164 case AArch64ISD::REV16: return "AArch64ISD::REV16"; 1165 case AArch64ISD::REV32: return "AArch64ISD::REV32"; 1166 case AArch64ISD::REV64: return "AArch64ISD::REV64"; 1167 case AArch64ISD::EXT: return "AArch64ISD::EXT"; 1168 case AArch64ISD::VSHL: return "AArch64ISD::VSHL"; 1169 case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR"; 1170 case AArch64ISD::VASHR: return "AArch64ISD::VASHR"; 1171 case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ"; 1172 case AArch64ISD::CMGE: return "AArch64ISD::CMGE"; 1173 case AArch64ISD::CMGT: return "AArch64ISD::CMGT"; 1174 case AArch64ISD::CMHI: return "AArch64ISD::CMHI"; 1175 case AArch64ISD::CMHS: return "AArch64ISD::CMHS"; 1176 case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ"; 1177 case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE"; 1178 case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT"; 1179 case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz"; 1180 case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz"; 1181 case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz"; 1182 case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz"; 1183 case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz"; 1184 case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz"; 1185 case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz"; 1186 case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; 1187 case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; 1188 case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; 1189 case AArch64ISD::SADDV: return "AArch64ISD::SADDV"; 1190 case AArch64ISD::UADDV: return "AArch64ISD::UADDV"; 1191 case AArch64ISD::SMINV: return "AArch64ISD::SMINV"; 1192 case AArch64ISD::UMINV: return "AArch64ISD::UMINV"; 1193 case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV"; 1194 case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV"; 1195 case AArch64ISD::NOT: return "AArch64ISD::NOT"; 1196 case AArch64ISD::BIT: return "AArch64ISD::BIT"; 1197 case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; 1198 case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ"; 1199 case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; 1200 case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; 1201 case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; 1202 case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH"; 1203 case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; 1204 case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; 1205 case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST"; 1206 case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I"; 1207 case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I"; 1208 case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I"; 1209 case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I"; 1210 case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I"; 1211 case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; 1212 case AArch64ISD::LD2post: return "AArch64ISD::LD2post"; 1213 case AArch64ISD::LD3post: return "AArch64ISD::LD3post"; 1214 case AArch64ISD::LD4post: return "AArch64ISD::LD4post"; 1215 case AArch64ISD::ST2post: return "AArch64ISD::ST2post"; 1216 case AArch64ISD::ST3post: return "AArch64ISD::ST3post"; 1217 case AArch64ISD::ST4post: return "AArch64ISD::ST4post"; 1218 case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post"; 1219 case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post"; 1220 case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post"; 1221 case AArch64ISD::ST1x2post: return 
"AArch64ISD::ST1x2post"; 1222 case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post"; 1223 case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post"; 1224 case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost"; 1225 case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost"; 1226 case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost"; 1227 case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost"; 1228 case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost"; 1229 case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost"; 1230 case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost"; 1231 case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost"; 1232 case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost"; 1233 case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost"; 1234 case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; 1235 case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; 1236 case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; 1237 case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE"; 1238 case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS"; 1239 case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; 1240 case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS"; 1241 case AArch64ISD::STG: return "AArch64ISD::STG"; 1242 case AArch64ISD::STZG: return "AArch64ISD::STZG"; 1243 case AArch64ISD::ST2G: return "AArch64ISD::ST2G"; 1244 case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G"; 1245 } 1246 return nullptr; 1247 } 1248 1249 MachineBasicBlock * 1250 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, 1251 MachineBasicBlock *MBB) const { 1252 // We materialise the F128CSEL pseudo-instruction as some control flow and a 1253 // phi node: 1254 1255 // OrigBB: 1256 // [... previous instrs leading to comparison ...] 1257 // b.ne TrueBB 1258 // b EndBB 1259 // TrueBB: 1260 // ; Fallthrough 1261 // EndBB: 1262 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] 1263 1264 MachineFunction *MF = MBB->getParent(); 1265 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 1266 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 1267 DebugLoc DL = MI.getDebugLoc(); 1268 MachineFunction::iterator It = ++MBB->getIterator(); 1269 1270 unsigned DestReg = MI.getOperand(0).getReg(); 1271 unsigned IfTrueReg = MI.getOperand(1).getReg(); 1272 unsigned IfFalseReg = MI.getOperand(2).getReg(); 1273 unsigned CondCode = MI.getOperand(3).getImm(); 1274 bool NZCVKilled = MI.getOperand(4).isKill(); 1275 1276 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); 1277 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); 1278 MF->insert(It, TrueBB); 1279 MF->insert(It, EndBB); 1280 1281 // Transfer rest of current basic-block to EndBB 1282 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), 1283 MBB->end()); 1284 EndBB->transferSuccessorsAndUpdatePHIs(MBB); 1285 1286 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); 1287 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); 1288 MBB->addSuccessor(TrueBB); 1289 MBB->addSuccessor(EndBB); 1290 1291 // TrueBB falls through to the end. 
1292 TrueBB->addSuccessor(EndBB); 1293 1294 if (!NZCVKilled) { 1295 TrueBB->addLiveIn(AArch64::NZCV); 1296 EndBB->addLiveIn(AArch64::NZCV); 1297 } 1298 1299 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) 1300 .addReg(IfTrueReg) 1301 .addMBB(TrueBB) 1302 .addReg(IfFalseReg) 1303 .addMBB(MBB); 1304 1305 MI.eraseFromParent(); 1306 return EndBB; 1307 } 1308 1309 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( 1310 MachineInstr &MI, MachineBasicBlock *BB) const { 1311 assert(!isAsynchronousEHPersonality(classifyEHPersonality( 1312 BB->getParent()->getFunction().getPersonalityFn())) && 1313 "SEH does not use catchret!"); 1314 return BB; 1315 } 1316 1317 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad( 1318 MachineInstr &MI, MachineBasicBlock *BB) const { 1319 MI.eraseFromParent(); 1320 return BB; 1321 } 1322 1323 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( 1324 MachineInstr &MI, MachineBasicBlock *BB) const { 1325 switch (MI.getOpcode()) { 1326 default: 1327 #ifndef NDEBUG 1328 MI.dump(); 1329 #endif 1330 llvm_unreachable("Unexpected instruction for custom inserter!"); 1331 1332 case AArch64::F128CSEL: 1333 return EmitF128CSEL(MI, BB); 1334 1335 case TargetOpcode::STACKMAP: 1336 case TargetOpcode::PATCHPOINT: 1337 return emitPatchPoint(MI, BB); 1338 1339 case AArch64::CATCHRET: 1340 return EmitLoweredCatchRet(MI, BB); 1341 case AArch64::CATCHPAD: 1342 return EmitLoweredCatchPad(MI, BB); 1343 } 1344 } 1345 1346 //===----------------------------------------------------------------------===// 1347 // AArch64 Lowering private implementation. 1348 //===----------------------------------------------------------------------===// 1349 1350 //===----------------------------------------------------------------------===// 1351 // Lowering Code 1352 //===----------------------------------------------------------------------===// 1353 1354 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 1355 /// CC 1356 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { 1357 switch (CC) { 1358 default: 1359 llvm_unreachable("Unknown condition code!"); 1360 case ISD::SETNE: 1361 return AArch64CC::NE; 1362 case ISD::SETEQ: 1363 return AArch64CC::EQ; 1364 case ISD::SETGT: 1365 return AArch64CC::GT; 1366 case ISD::SETGE: 1367 return AArch64CC::GE; 1368 case ISD::SETLT: 1369 return AArch64CC::LT; 1370 case ISD::SETLE: 1371 return AArch64CC::LE; 1372 case ISD::SETUGT: 1373 return AArch64CC::HI; 1374 case ISD::SETUGE: 1375 return AArch64CC::HS; 1376 case ISD::SETULT: 1377 return AArch64CC::LO; 1378 case ISD::SETULE: 1379 return AArch64CC::LS; 1380 } 1381 } 1382 1383 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. 
1384 static void changeFPCCToAArch64CC(ISD::CondCode CC, 1385 AArch64CC::CondCode &CondCode, 1386 AArch64CC::CondCode &CondCode2) { 1387 CondCode2 = AArch64CC::AL; 1388 switch (CC) { 1389 default: 1390 llvm_unreachable("Unknown FP condition!"); 1391 case ISD::SETEQ: 1392 case ISD::SETOEQ: 1393 CondCode = AArch64CC::EQ; 1394 break; 1395 case ISD::SETGT: 1396 case ISD::SETOGT: 1397 CondCode = AArch64CC::GT; 1398 break; 1399 case ISD::SETGE: 1400 case ISD::SETOGE: 1401 CondCode = AArch64CC::GE; 1402 break; 1403 case ISD::SETOLT: 1404 CondCode = AArch64CC::MI; 1405 break; 1406 case ISD::SETOLE: 1407 CondCode = AArch64CC::LS; 1408 break; 1409 case ISD::SETONE: 1410 CondCode = AArch64CC::MI; 1411 CondCode2 = AArch64CC::GT; 1412 break; 1413 case ISD::SETO: 1414 CondCode = AArch64CC::VC; 1415 break; 1416 case ISD::SETUO: 1417 CondCode = AArch64CC::VS; 1418 break; 1419 case ISD::SETUEQ: 1420 CondCode = AArch64CC::EQ; 1421 CondCode2 = AArch64CC::VS; 1422 break; 1423 case ISD::SETUGT: 1424 CondCode = AArch64CC::HI; 1425 break; 1426 case ISD::SETUGE: 1427 CondCode = AArch64CC::PL; 1428 break; 1429 case ISD::SETLT: 1430 case ISD::SETULT: 1431 CondCode = AArch64CC::LT; 1432 break; 1433 case ISD::SETLE: 1434 case ISD::SETULE: 1435 CondCode = AArch64CC::LE; 1436 break; 1437 case ISD::SETNE: 1438 case ISD::SETUNE: 1439 CondCode = AArch64CC::NE; 1440 break; 1441 } 1442 } 1443 1444 /// Convert a DAG fp condition code to an AArch64 CC. 1445 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that 1446 /// should be AND'ed instead of OR'ed. 1447 static void changeFPCCToANDAArch64CC(ISD::CondCode CC, 1448 AArch64CC::CondCode &CondCode, 1449 AArch64CC::CondCode &CondCode2) { 1450 CondCode2 = AArch64CC::AL; 1451 switch (CC) { 1452 default: 1453 changeFPCCToAArch64CC(CC, CondCode, CondCode2); 1454 assert(CondCode2 == AArch64CC::AL); 1455 break; 1456 case ISD::SETONE: 1457 // (a one b) 1458 // == ((a olt b) || (a ogt b)) 1459 // == ((a ord b) && (a une b)) 1460 CondCode = AArch64CC::VC; 1461 CondCode2 = AArch64CC::NE; 1462 break; 1463 case ISD::SETUEQ: 1464 // (a ueq b) 1465 // == ((a uno b) || (a oeq b)) 1466 // == ((a ule b) && (a uge b)) 1467 CondCode = AArch64CC::PL; 1468 CondCode2 = AArch64CC::LE; 1469 break; 1470 } 1471 } 1472 1473 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 1474 /// CC usable with the vector instructions. Fewer operations are available 1475 /// without a real NZCV register, so we have to use less efficient combinations 1476 /// to get the same effect. 1477 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, 1478 AArch64CC::CondCode &CondCode, 1479 AArch64CC::CondCode &CondCode2, 1480 bool &Invert) { 1481 Invert = false; 1482 switch (CC) { 1483 default: 1484 // Mostly the scalar mappings work fine. 1485 changeFPCCToAArch64CC(CC, CondCode, CondCode2); 1486 break; 1487 case ISD::SETUO: 1488 Invert = true; 1489 LLVM_FALLTHROUGH; 1490 case ISD::SETO: 1491 CondCode = AArch64CC::MI; 1492 CondCode2 = AArch64CC::GE; 1493 break; 1494 case ISD::SETUEQ: 1495 case ISD::SETULT: 1496 case ISD::SETULE: 1497 case ISD::SETUGT: 1498 case ISD::SETUGE: 1499 // All of the compare-mask comparisons are ordered, but we can switch 1500 // between the two by a double inversion. E.g. ULE == !OGT. 1501 Invert = true; 1502 changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2); 1503 break; 1504 } 1505 } 1506 1507 static bool isLegalArithImmed(uint64_t C) { 1508 // Matches AArch64DAGToDAGISel::SelectArithImmed(). 
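  // Arithmetic immediates are 12-bit unsigned values, optionally shifted left
  // by 12 bits: e.g. 0xFFF and 0xFFF000 are legal, 0x1001 is not.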
1509 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); 1510 LLVM_DEBUG(dbgs() << "Is imm " << C 1511 << " legal: " << (IsLegal ? "yes\n" : "no\n")); 1512 return IsLegal; 1513 } 1514 1515 // Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on 1516 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags 1517 // can be set differently by this operation. It comes down to whether 1518 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then 1519 // everything is fine. If not then the optimization is wrong. Thus general 1520 // comparisons are only valid if op2 != 0. 1521 // 1522 // So, finally, the only LLVM-native comparisons that don't mention C and V 1523 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in 1524 // the absence of information about op2. 1525 static bool isCMN(SDValue Op, ISD::CondCode CC) { 1526 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) && 1527 (CC == ISD::SETEQ || CC == ISD::SETNE); 1528 } 1529 1530 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, 1531 const SDLoc &dl, SelectionDAG &DAG) { 1532 EVT VT = LHS.getValueType(); 1533 const bool FullFP16 = 1534 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); 1535 1536 if (VT.isFloatingPoint()) { 1537 assert(VT != MVT::f128); 1538 if (VT == MVT::f16 && !FullFP16) { 1539 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); 1540 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); 1541 VT = MVT::f32; 1542 } 1543 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); 1544 } 1545 1546 // The CMP instruction is just an alias for SUBS, and representing it as 1547 // SUBS means that it's possible to get CSE with subtract operations. 1548 // A later phase can perform the optimization of setting the destination 1549 // register to WZR/XZR if it ends up being unused. 1550 unsigned Opcode = AArch64ISD::SUBS; 1551 1552 if (isCMN(RHS, CC)) { 1553 // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ? 1554 Opcode = AArch64ISD::ADDS; 1555 RHS = RHS.getOperand(1); 1556 } else if (isCMN(LHS, CC)) { 1557 // As we are looking for EQ/NE compares, the operands can be commuted ; can 1558 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ? 1559 Opcode = AArch64ISD::ADDS; 1560 LHS = LHS.getOperand(1); 1561 } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && 1562 !isUnsignedIntSetCC(CC)) { 1563 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST 1564 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one 1565 // of the signed comparisons. 1566 Opcode = AArch64ISD::ANDS; 1567 RHS = LHS.getOperand(1); 1568 LHS = LHS.getOperand(0); 1569 } 1570 1571 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) 1572 .getValue(1); 1573 } 1574 1575 /// \defgroup AArch64CCMP CMP;CCMP matching 1576 /// 1577 /// These functions deal with the formation of CMP;CCMP;... sequences. 1578 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of 1579 /// a comparison. They set the NZCV flags to a predefined value if their 1580 /// predicate is false. This allows to express arbitrary conjunctions, for 1581 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))" 1582 /// expressed as: 1583 /// cmp A 1584 /// ccmp B, inv(CB), CA 1585 /// check for CB flags 1586 /// 1587 /// This naturally lets us implement chains of AND operations with SETCC 1588 /// operands. 
And we can even implement some other situations by transforming 1589 /// them: 1590 /// - We can implement (NEG SETCC) i.e. negating a single comparison by 1591 /// negating the flags used in a CCMP/FCCMP operations. 1592 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations 1593 /// by negating the flags we test for afterwards. i.e. 1594 /// NEG (CMP CCMP CCCMP ...) can be implemented. 1595 /// - Note that we can only ever negate all previously processed results. 1596 /// What we can not implement by flipping the flags to test is a negation 1597 /// of two sub-trees (because the negation affects all sub-trees emitted so 1598 /// far, so the 2nd sub-tree we emit would also affect the first). 1599 /// With those tools we can implement some OR operations: 1600 /// - (OR (SETCC A) (SETCC B)) can be implemented via: 1601 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B))) 1602 /// - After transforming OR to NEG/AND combinations we may be able to use NEG 1603 /// elimination rules from earlier to implement the whole thing as a 1604 /// CCMP/FCCMP chain. 1605 /// 1606 /// As complete example: 1607 /// or (or (setCA (cmp A)) (setCB (cmp B))) 1608 /// (and (setCC (cmp C)) (setCD (cmp D)))" 1609 /// can be reassociated to: 1610 /// or (and (setCC (cmp C)) setCD (cmp D)) 1611 // (or (setCA (cmp A)) (setCB (cmp B))) 1612 /// can be transformed to: 1613 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) 1614 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))" 1615 /// which can be implemented as: 1616 /// cmp C 1617 /// ccmp D, inv(CD), CC 1618 /// ccmp A, CA, inv(CD) 1619 /// ccmp B, CB, inv(CA) 1620 /// check for CB flags 1621 /// 1622 /// A counterexample is "or (and A B) (and C D)" which translates to 1623 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we 1624 /// can only implement 1 of the inner (not) operations, but not both! 1625 /// @{ 1626 1627 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. 1628 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, 1629 ISD::CondCode CC, SDValue CCOp, 1630 AArch64CC::CondCode Predicate, 1631 AArch64CC::CondCode OutCC, 1632 const SDLoc &DL, SelectionDAG &DAG) { 1633 unsigned Opcode = 0; 1634 const bool FullFP16 = 1635 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); 1636 1637 if (LHS.getValueType().isFloatingPoint()) { 1638 assert(LHS.getValueType() != MVT::f128); 1639 if (LHS.getValueType() == MVT::f16 && !FullFP16) { 1640 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS); 1641 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); 1642 } 1643 Opcode = AArch64ISD::FCCMP; 1644 } else if (RHS.getOpcode() == ISD::SUB) { 1645 SDValue SubOp0 = RHS.getOperand(0); 1646 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { 1647 // See emitComparison() on why we can only do this for SETEQ and SETNE. 1648 Opcode = AArch64ISD::CCMN; 1649 RHS = RHS.getOperand(1); 1650 } 1651 } 1652 if (Opcode == 0) 1653 Opcode = AArch64ISD::CCMP; 1654 1655 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC); 1656 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); 1657 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); 1658 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); 1659 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); 1660 } 1661 1662 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be 1663 /// expressed as a conjunction. See \ref AArch64CCMP. 
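/// For example (and (setcc A) (or (setcc B) (setcc C))) qualifies, provided
/// every node has a single use, no SETCC operates on f128 and the tree is not
/// nested too deeply.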
1664 /// \param CanNegate Set to true if we can negate the whole sub-tree just by 1665 /// changing the conditions on the SETCC tests. 1666 /// (this means we can call emitConjunctionRec() with 1667 /// Negate==true on this sub-tree) 1668 /// \param MustBeFirst Set to true if this subtree needs to be negated and we 1669 /// cannot do the negation naturally. We are required to 1670 /// emit the subtree first in this case. 1671 /// \param WillNegate Is true if are called when the result of this 1672 /// subexpression must be negated. This happens when the 1673 /// outer expression is an OR. We can use this fact to know 1674 /// that we have a double negation (or (or ...) ...) that 1675 /// can be implemented for free. 1676 static bool canEmitConjunction(const SDValue Val, bool &CanNegate, 1677 bool &MustBeFirst, bool WillNegate, 1678 unsigned Depth = 0) { 1679 if (!Val.hasOneUse()) 1680 return false; 1681 unsigned Opcode = Val->getOpcode(); 1682 if (Opcode == ISD::SETCC) { 1683 if (Val->getOperand(0).getValueType() == MVT::f128) 1684 return false; 1685 CanNegate = true; 1686 MustBeFirst = false; 1687 return true; 1688 } 1689 // Protect against exponential runtime and stack overflow. 1690 if (Depth > 6) 1691 return false; 1692 if (Opcode == ISD::AND || Opcode == ISD::OR) { 1693 bool IsOR = Opcode == ISD::OR; 1694 SDValue O0 = Val->getOperand(0); 1695 SDValue O1 = Val->getOperand(1); 1696 bool CanNegateL; 1697 bool MustBeFirstL; 1698 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1)) 1699 return false; 1700 bool CanNegateR; 1701 bool MustBeFirstR; 1702 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1)) 1703 return false; 1704 1705 if (MustBeFirstL && MustBeFirstR) 1706 return false; 1707 1708 if (IsOR) { 1709 // For an OR expression we need to be able to naturally negate at least 1710 // one side or we cannot do the transformation at all. 1711 if (!CanNegateL && !CanNegateR) 1712 return false; 1713 // If we the result of the OR will be negated and we can naturally negate 1714 // the leafs, then this sub-tree as a whole negates naturally. 1715 CanNegate = WillNegate && CanNegateL && CanNegateR; 1716 // If we cannot naturally negate the whole sub-tree, then this must be 1717 // emitted first. 1718 MustBeFirst = !CanNegate; 1719 } else { 1720 assert(Opcode == ISD::AND && "Must be OR or AND"); 1721 // We cannot naturally negate an AND operation. 1722 CanNegate = false; 1723 MustBeFirst = MustBeFirstL || MustBeFirstR; 1724 } 1725 return true; 1726 } 1727 return false; 1728 } 1729 1730 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain 1731 /// of CCMP/CFCMP ops. See @ref AArch64CCMP. 1732 /// Tries to transform the given i1 producing node @p Val to a series compare 1733 /// and conditional compare operations. @returns an NZCV flags producing node 1734 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if 1735 /// transformation was not possible. 1736 /// \p Negate is true if we want this sub-tree being negated just by changing 1737 /// SETCC conditions. 1738 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, 1739 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, 1740 AArch64CC::CondCode Predicate) { 1741 // We're at a tree leaf, produce a conditional comparison operation. 
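  // (Only when Val is a SETCC; AND/OR interior nodes are handled by the
  // recursion further down.)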
1742 unsigned Opcode = Val->getOpcode(); 1743 if (Opcode == ISD::SETCC) { 1744 SDValue LHS = Val->getOperand(0); 1745 SDValue RHS = Val->getOperand(1); 1746 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); 1747 bool isInteger = LHS.getValueType().isInteger(); 1748 if (Negate) 1749 CC = getSetCCInverse(CC, isInteger); 1750 SDLoc DL(Val); 1751 // Determine OutCC and handle FP special case. 1752 if (isInteger) { 1753 OutCC = changeIntCCToAArch64CC(CC); 1754 } else { 1755 assert(LHS.getValueType().isFloatingPoint()); 1756 AArch64CC::CondCode ExtraCC; 1757 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); 1758 // Some floating point conditions can't be tested with a single condition 1759 // code. Construct an additional comparison in this case. 1760 if (ExtraCC != AArch64CC::AL) { 1761 SDValue ExtraCmp; 1762 if (!CCOp.getNode()) 1763 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); 1764 else 1765 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, 1766 ExtraCC, DL, DAG); 1767 CCOp = ExtraCmp; 1768 Predicate = ExtraCC; 1769 } 1770 } 1771 1772 // Produce a normal comparison if we are first in the chain 1773 if (!CCOp) 1774 return emitComparison(LHS, RHS, CC, DL, DAG); 1775 // Otherwise produce a ccmp. 1776 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, 1777 DAG); 1778 } 1779 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree"); 1780 1781 bool IsOR = Opcode == ISD::OR; 1782 1783 SDValue LHS = Val->getOperand(0); 1784 bool CanNegateL; 1785 bool MustBeFirstL; 1786 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR); 1787 assert(ValidL && "Valid conjunction/disjunction tree"); 1788 (void)ValidL; 1789 1790 SDValue RHS = Val->getOperand(1); 1791 bool CanNegateR; 1792 bool MustBeFirstR; 1793 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR); 1794 assert(ValidR && "Valid conjunction/disjunction tree"); 1795 (void)ValidR; 1796 1797 // Swap sub-tree that must come first to the right side. 1798 if (MustBeFirstL) { 1799 assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); 1800 std::swap(LHS, RHS); 1801 std::swap(CanNegateL, CanNegateR); 1802 std::swap(MustBeFirstL, MustBeFirstR); 1803 } 1804 1805 bool NegateR; 1806 bool NegateAfterR; 1807 bool NegateL; 1808 bool NegateAfterAll; 1809 if (Opcode == ISD::OR) { 1810 // Swap the sub-tree that we can negate naturally to the left. 1811 if (!CanNegateL) { 1812 assert(CanNegateR && "at least one side must be negatable"); 1813 assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); 1814 assert(!Negate); 1815 std::swap(LHS, RHS); 1816 NegateR = false; 1817 NegateAfterR = true; 1818 } else { 1819 // Negate the left sub-tree if possible, otherwise negate the result. 1820 NegateR = CanNegateR; 1821 NegateAfterR = !CanNegateR; 1822 } 1823 NegateL = true; 1824 NegateAfterAll = !Negate; 1825 } else { 1826 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree"); 1827 assert(!Negate && "Valid conjunction/disjunction tree"); 1828 1829 NegateL = false; 1830 NegateR = false; 1831 NegateAfterR = false; 1832 NegateAfterAll = false; 1833 } 1834 1835 // Emit sub-trees. 
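  // The right sub-tree is emitted first so that its flags and condition can
  // feed the left sub-tree's conditional compares through CCOp/Predicate.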
1836 AArch64CC::CondCode RHSCC; 1837 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate); 1838 if (NegateAfterR) 1839 RHSCC = AArch64CC::getInvertedCondCode(RHSCC); 1840 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC); 1841 if (NegateAfterAll) 1842 OutCC = AArch64CC::getInvertedCondCode(OutCC); 1843 return CmpL; 1844 } 1845 1846 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops). 1847 /// In some cases this is even possible with OR operations in the expression. 1848 /// See \ref AArch64CCMP. 1849 /// \see emitConjunctionRec(). 1850 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, 1851 AArch64CC::CondCode &OutCC) { 1852 bool DummyCanNegate; 1853 bool DummyMustBeFirst; 1854 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false)) 1855 return SDValue(); 1856 1857 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL); 1858 } 1859 1860 /// @} 1861 1862 /// Returns how profitable it is to fold a comparison's operand's shift and/or 1863 /// extension operations. 1864 static unsigned getCmpOperandFoldingProfit(SDValue Op) { 1865 auto isSupportedExtend = [&](SDValue V) { 1866 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG) 1867 return true; 1868 1869 if (V.getOpcode() == ISD::AND) 1870 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) { 1871 uint64_t Mask = MaskCst->getZExtValue(); 1872 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF); 1873 } 1874 1875 return false; 1876 }; 1877 1878 if (!Op.hasOneUse()) 1879 return 0; 1880 1881 if (isSupportedExtend(Op)) 1882 return 1; 1883 1884 unsigned Opc = Op.getOpcode(); 1885 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) 1886 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 1887 uint64_t Shift = ShiftCst->getZExtValue(); 1888 if (isSupportedExtend(Op.getOperand(0))) 1889 return (Shift <= 4) ? 2 : 1; 1890 EVT VT = Op.getValueType(); 1891 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63)) 1892 return 1; 1893 } 1894 1895 return 0; 1896 } 1897 1898 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 1899 SDValue &AArch64cc, SelectionDAG &DAG, 1900 const SDLoc &dl) { 1901 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 1902 EVT VT = RHS.getValueType(); 1903 uint64_t C = RHSC->getZExtValue(); 1904 if (!isLegalArithImmed(C)) { 1905 // Constant does not fit, try adjusting it by one? 1906 switch (CC) { 1907 default: 1908 break; 1909 case ISD::SETLT: 1910 case ISD::SETGE: 1911 if ((VT == MVT::i32 && C != 0x80000000 && 1912 isLegalArithImmed((uint32_t)(C - 1))) || 1913 (VT == MVT::i64 && C != 0x80000000ULL && 1914 isLegalArithImmed(C - 1ULL))) { 1915 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 1916 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 1917 RHS = DAG.getConstant(C, dl, VT); 1918 } 1919 break; 1920 case ISD::SETULT: 1921 case ISD::SETUGE: 1922 if ((VT == MVT::i32 && C != 0 && 1923 isLegalArithImmed((uint32_t)(C - 1))) || 1924 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { 1925 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 1926 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 1927 RHS = DAG.getConstant(C, dl, VT); 1928 } 1929 break; 1930 case ISD::SETLE: 1931 case ISD::SETGT: 1932 if ((VT == MVT::i32 && C != INT32_MAX && 1933 isLegalArithImmed((uint32_t)(C + 1))) || 1934 (VT == MVT::i64 && C != INT64_MAX && 1935 isLegalArithImmed(C + 1ULL))) { 1936 CC = (CC == ISD::SETLE) ? 
ISD::SETLT : ISD::SETGE; 1937 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 1938 RHS = DAG.getConstant(C, dl, VT); 1939 } 1940 break; 1941 case ISD::SETULE: 1942 case ISD::SETUGT: 1943 if ((VT == MVT::i32 && C != UINT32_MAX && 1944 isLegalArithImmed((uint32_t)(C + 1))) || 1945 (VT == MVT::i64 && C != UINT64_MAX && 1946 isLegalArithImmed(C + 1ULL))) { 1947 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 1948 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 1949 RHS = DAG.getConstant(C, dl, VT); 1950 } 1951 break; 1952 } 1953 } 1954 } 1955 1956 // Comparisons are canonicalized so that the RHS operand is simpler than the 1957 // LHS one, the extreme case being when RHS is an immediate. However, AArch64 1958 // can fold some shift+extend operations on the RHS operand, so swap the 1959 // operands if that can be done. 1960 // 1961 // For example: 1962 // lsl w13, w11, #1 1963 // cmp w13, w12 1964 // can be turned into: 1965 // cmp w12, w11, lsl #1 1966 if (!isa<ConstantSDNode>(RHS) || 1967 !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) { 1968 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS; 1969 1970 if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) { 1971 std::swap(LHS, RHS); 1972 CC = ISD::getSetCCSwappedOperands(CC); 1973 } 1974 } 1975 1976 SDValue Cmp; 1977 AArch64CC::CondCode AArch64CC; 1978 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { 1979 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); 1980 1981 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. 1982 // For the i8 operand, the largest immediate is 255, so this can be easily 1983 // encoded in the compare instruction. For the i16 operand, however, the 1984 // largest immediate cannot be encoded in the compare. 1985 // Therefore, use a sign extending load and cmn to avoid materializing the 1986 // -1 constant. For example, 1987 // movz w1, #65535 1988 // ldrh w0, [x0, #0] 1989 // cmp w0, w1 1990 // > 1991 // ldrsh w0, [x0, #0] 1992 // cmn w0, #1 1993 // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) 1994 // if and only if (sext LHS) == (sext RHS). The checks are in place to 1995 // ensure both the LHS and RHS are truly zero extended and to make sure the 1996 // transformation is profitable. 
1997 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && 1998 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && 1999 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && 2000 LHS.getNode()->hasNUsesOfValue(1, 0)) { 2001 int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); 2002 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { 2003 SDValue SExt = 2004 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, 2005 DAG.getValueType(MVT::i16)); 2006 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, 2007 RHS.getValueType()), 2008 CC, dl, DAG); 2009 AArch64CC = changeIntCCToAArch64CC(CC); 2010 } 2011 } 2012 2013 if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { 2014 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) { 2015 if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) 2016 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); 2017 } 2018 } 2019 } 2020 2021 if (!Cmp) { 2022 Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 2023 AArch64CC = changeIntCCToAArch64CC(CC); 2024 } 2025 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); 2026 return Cmp; 2027 } 2028 2029 static std::pair<SDValue, SDValue> 2030 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { 2031 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && 2032 "Unsupported value type"); 2033 SDValue Value, Overflow; 2034 SDLoc DL(Op); 2035 SDValue LHS = Op.getOperand(0); 2036 SDValue RHS = Op.getOperand(1); 2037 unsigned Opc = 0; 2038 switch (Op.getOpcode()) { 2039 default: 2040 llvm_unreachable("Unknown overflow instruction!"); 2041 case ISD::SADDO: 2042 Opc = AArch64ISD::ADDS; 2043 CC = AArch64CC::VS; 2044 break; 2045 case ISD::UADDO: 2046 Opc = AArch64ISD::ADDS; 2047 CC = AArch64CC::HS; 2048 break; 2049 case ISD::SSUBO: 2050 Opc = AArch64ISD::SUBS; 2051 CC = AArch64CC::VS; 2052 break; 2053 case ISD::USUBO: 2054 Opc = AArch64ISD::SUBS; 2055 CC = AArch64CC::LO; 2056 break; 2057 // Multiply needs a little bit extra work. 2058 case ISD::SMULO: 2059 case ISD::UMULO: { 2060 CC = AArch64CC::NE; 2061 bool IsSigned = Op.getOpcode() == ISD::SMULO; 2062 if (Op.getValueType() == MVT::i32) { 2063 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 2064 // For a 32 bit multiply with overflow check we want the instruction 2065 // selector to generate a widening multiply (SMADDL/UMADDL). For that we 2066 // need to generate the following pattern: 2067 // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) 2068 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); 2069 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); 2070 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 2071 SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, 2072 DAG.getConstant(0, DL, MVT::i64)); 2073 // On AArch64 the upper 32 bits are always zero extended for a 32 bit 2074 // operation. We need to clear out the upper 32 bits, because we used a 2075 // widening multiply that wrote all 64 bits. In the end this should be a 2076 // noop. 2077 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); 2078 if (IsSigned) { 2079 // The signed overflow check requires more than just a simple check for 2080 // any bit set in the upper 32 bits of the result. These bits could be 2081 // just the sign bits of a negative number. To perform the overflow 2082 // check we have to arithmetic shift right the 32nd bit of the result by 2083 // 31 bits. Then we compare the result to the upper 32 bits. 
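        // Equivalently: there is no signed overflow iff the upper 32 bits of
        // the widened product equal the sign extension of the lower 32 bits.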
2084 SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, 2085 DAG.getConstant(32, DL, MVT::i64)); 2086 UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); 2087 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, 2088 DAG.getConstant(31, DL, MVT::i64)); 2089 // It is important that LowerBits is last, otherwise the arithmetic 2090 // shift will not be folded into the compare (SUBS). 2091 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); 2092 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 2093 .getValue(1); 2094 } else { 2095 // The overflow check for unsigned multiply is easy. We only need to 2096 // check if any of the upper 32 bits are set. This can be done with a 2097 // CMP (shifted register). For that we need to generate the following 2098 // pattern: 2099 // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) 2100 SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, 2101 DAG.getConstant(32, DL, MVT::i64)); 2102 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 2103 Overflow = 2104 DAG.getNode(AArch64ISD::SUBS, DL, VTs, 2105 DAG.getConstant(0, DL, MVT::i64), 2106 UpperBits).getValue(1); 2107 } 2108 break; 2109 } 2110 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); 2111 // For the 64 bit multiply 2112 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 2113 if (IsSigned) { 2114 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); 2115 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, 2116 DAG.getConstant(63, DL, MVT::i64)); 2117 // It is important that LowerBits is last, otherwise the arithmetic 2118 // shift will not be folded into the compare (SUBS). 2119 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 2120 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 2121 .getValue(1); 2122 } else { 2123 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); 2124 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 2125 Overflow = 2126 DAG.getNode(AArch64ISD::SUBS, DL, VTs, 2127 DAG.getConstant(0, DL, MVT::i64), 2128 UpperBits).getValue(1); 2129 } 2130 break; 2131 } 2132 } // switch (...) 2133 2134 if (Opc) { 2135 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); 2136 2137 // Emit the AArch64 operation with overflow check. 2138 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); 2139 Overflow = Value.getValue(1); 2140 } 2141 return std::make_pair(Value, Overflow); 2142 } 2143 2144 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, 2145 RTLIB::Libcall Call) const { 2146 SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); 2147 return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; 2148 } 2149 2150 // Returns true if the given Op is the overflow flag result of an overflow 2151 // intrinsic operation. 2152 static bool isOverflowIntrOpRes(SDValue Op) { 2153 unsigned Opc = Op.getOpcode(); 2154 return (Op.getResNo() == 1 && 2155 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || 2156 Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)); 2157 } 2158 2159 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { 2160 SDValue Sel = Op.getOperand(0); 2161 SDValue Other = Op.getOperand(1); 2162 SDLoc dl(Sel); 2163 2164 // If the operand is an overflow checking operation, invert the condition 2165 // code and kill the Not operation. I.e., transform: 2166 // (xor (overflow_op_bool, 1)) 2167 // --> 2168 // (csel 1, 0, invert(cc), overflow_op_bool) 2169 // ... 
which later gets transformed to just a cset instruction with an 2170 // inverted condition code, rather than a cset + eor sequence. 2171 if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) { 2172 // Only lower legal XALUO ops. 2173 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0))) 2174 return SDValue(); 2175 2176 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 2177 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 2178 AArch64CC::CondCode CC; 2179 SDValue Value, Overflow; 2180 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG); 2181 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); 2182 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal, 2183 CCVal, Overflow); 2184 } 2185 // If neither operand is a SELECT_CC, give up. 2186 if (Sel.getOpcode() != ISD::SELECT_CC) 2187 std::swap(Sel, Other); 2188 if (Sel.getOpcode() != ISD::SELECT_CC) 2189 return Op; 2190 2191 // The folding we want to perform is: 2192 // (xor x, (select_cc a, b, cc, 0, -1) ) 2193 // --> 2194 // (csel x, (xor x, -1), cc ...) 2195 // 2196 // The latter will get matched to a CSINV instruction. 2197 2198 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get(); 2199 SDValue LHS = Sel.getOperand(0); 2200 SDValue RHS = Sel.getOperand(1); 2201 SDValue TVal = Sel.getOperand(2); 2202 SDValue FVal = Sel.getOperand(3); 2203 2204 // FIXME: This could be generalized to non-integer comparisons. 2205 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 2206 return Op; 2207 2208 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); 2209 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); 2210 2211 // The values aren't constants, this isn't the pattern we're looking for. 2212 if (!CFVal || !CTVal) 2213 return Op; 2214 2215 // We can commute the SELECT_CC by inverting the condition. This 2216 // might be needed to make this fit into a CSINV pattern. 2217 if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { 2218 std::swap(TVal, FVal); 2219 std::swap(CTVal, CFVal); 2220 CC = ISD::getSetCCInverse(CC, true); 2221 } 2222 2223 // If the constants line up, perform the transform! 2224 if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { 2225 SDValue CCVal; 2226 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 2227 2228 FVal = Other; 2229 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, 2230 DAG.getConstant(-1ULL, dl, Other.getValueType())); 2231 2232 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, 2233 CCVal, Cmp); 2234 } 2235 2236 return Op; 2237 } 2238 2239 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 2240 EVT VT = Op.getValueType(); 2241 2242 // Let legalize expand this if it isn't a legal type yet. 
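  // In the switch below, ADDC/SUBC map to the flag-setting ADDS/SUBS, while
  // ADDE/SUBE also consume the incoming carry and map to ADCS/SBCS.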
2243 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 2244 return SDValue(); 2245 2246 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 2247 2248 unsigned Opc; 2249 bool ExtraOp = false; 2250 switch (Op.getOpcode()) { 2251 default: 2252 llvm_unreachable("Invalid code"); 2253 case ISD::ADDC: 2254 Opc = AArch64ISD::ADDS; 2255 break; 2256 case ISD::SUBC: 2257 Opc = AArch64ISD::SUBS; 2258 break; 2259 case ISD::ADDE: 2260 Opc = AArch64ISD::ADCS; 2261 ExtraOp = true; 2262 break; 2263 case ISD::SUBE: 2264 Opc = AArch64ISD::SBCS; 2265 ExtraOp = true; 2266 break; 2267 } 2268 2269 if (!ExtraOp) 2270 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); 2271 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), 2272 Op.getOperand(2)); 2273 } 2274 2275 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { 2276 // Let legalize expand this if it isn't a legal type yet. 2277 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 2278 return SDValue(); 2279 2280 SDLoc dl(Op); 2281 AArch64CC::CondCode CC; 2282 // The actual operation that sets the overflow or carry flag. 2283 SDValue Value, Overflow; 2284 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); 2285 2286 // We use 0 and 1 as false and true values. 2287 SDValue TVal = DAG.getConstant(1, dl, MVT::i32); 2288 SDValue FVal = DAG.getConstant(0, dl, MVT::i32); 2289 2290 // We use an inverted condition, because the conditional select is inverted 2291 // too. This will allow it to be selected to a single instruction: 2292 // CSINC Wd, WZR, WZR, invert(cond). 2293 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); 2294 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, 2295 CCVal, Overflow); 2296 2297 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 2298 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); 2299 } 2300 2301 // Prefetch operands are: 2302 // 1: Address to prefetch 2303 // 2: bool isWrite 2304 // 3: int locality (0 = no locality ... 3 = extreme locality) 2305 // 4: bool isDataCache 2306 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { 2307 SDLoc DL(Op); 2308 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 2309 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 2310 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 2311 2312 bool IsStream = !Locality; 2313 // When the locality number is set 2314 if (Locality) { 2315 // The front-end should have filtered out the out-of-range values 2316 assert(Locality <= 3 && "Prefetch locality out-of-range"); 2317 // The locality degree is the opposite of the cache speed. 2318 // Put the number the other way around. 2319 // The encoding starts at 0 for level 1 2320 Locality = 3 - Locality; 2321 } 2322 2323 // built the mask value encoding the expected behavior. 
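  // For example, a read prefetch of locality 3 into the data cache encodes as
  // (0 << 4) | (0 << 3) | (0 << 1) | 0 == 0, i.e. the PLDL1KEEP hint.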
2324 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit 2325 (!IsData << 3) | // IsDataCache bit 2326 (Locality << 1) | // Cache level bits 2327 (unsigned)IsStream; // Stream bit 2328 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), 2329 DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1)); 2330 } 2331 2332 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, 2333 SelectionDAG &DAG) const { 2334 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); 2335 2336 RTLIB::Libcall LC; 2337 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 2338 2339 return LowerF128Call(Op, DAG, LC); 2340 } 2341 2342 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, 2343 SelectionDAG &DAG) const { 2344 if (Op.getOperand(0).getValueType() != MVT::f128) { 2345 // It's legal except when f128 is involved 2346 return Op; 2347 } 2348 2349 RTLIB::Libcall LC; 2350 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); 2351 2352 // FP_ROUND node has a second operand indicating whether it is known to be 2353 // precise. That doesn't take part in the LibCall so we can't directly use 2354 // LowerF128Call. 2355 SDValue SrcVal = Op.getOperand(0); 2356 return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, 2357 SDLoc(Op)).first; 2358 } 2359 2360 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, 2361 SelectionDAG &DAG) const { 2362 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 2363 // Any additional optimization in this function should be recorded 2364 // in the cost tables. 2365 EVT InVT = Op.getOperand(0).getValueType(); 2366 EVT VT = Op.getValueType(); 2367 unsigned NumElts = InVT.getVectorNumElements(); 2368 2369 // f16 conversions are promoted to f32 when full fp16 is not supported. 2370 if (InVT.getVectorElementType() == MVT::f16 && 2371 !Subtarget->hasFullFP16()) { 2372 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); 2373 SDLoc dl(Op); 2374 return DAG.getNode( 2375 Op.getOpcode(), dl, Op.getValueType(), 2376 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); 2377 } 2378 2379 if (VT.getSizeInBits() < InVT.getSizeInBits()) { 2380 SDLoc dl(Op); 2381 SDValue Cv = 2382 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), 2383 Op.getOperand(0)); 2384 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); 2385 } 2386 2387 if (VT.getSizeInBits() > InVT.getSizeInBits()) { 2388 SDLoc dl(Op); 2389 MVT ExtVT = 2390 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), 2391 VT.getVectorNumElements()); 2392 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); 2393 return DAG.getNode(Op.getOpcode(), dl, VT, Ext); 2394 } 2395 2396 // Type changing conversions are illegal. 2397 return Op; 2398 } 2399 2400 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, 2401 SelectionDAG &DAG) const { 2402 if (Op.getOperand(0).getValueType().isVector()) 2403 return LowerVectorFP_TO_INT(Op, DAG); 2404 2405 // f16 conversions are promoted to f32 when full fp16 is not supported. 
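  // The operand is widened with an FP_EXTEND to f32 and the original
  // FP_TO_SINT/FP_TO_UINT opcode is re-issued on the wider type.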
2406 if (Op.getOperand(0).getValueType() == MVT::f16 && 2407 !Subtarget->hasFullFP16()) { 2408 SDLoc dl(Op); 2409 return DAG.getNode( 2410 Op.getOpcode(), dl, Op.getValueType(), 2411 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0))); 2412 } 2413 2414 if (Op.getOperand(0).getValueType() != MVT::f128) { 2415 // It's legal except when f128 is involved 2416 return Op; 2417 } 2418 2419 RTLIB::Libcall LC; 2420 if (Op.getOpcode() == ISD::FP_TO_SINT) 2421 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); 2422 else 2423 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); 2424 2425 SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); 2426 return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; 2427 } 2428 2429 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 2430 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 2431 // Any additional optimization in this function should be recorded 2432 // in the cost tables. 2433 EVT VT = Op.getValueType(); 2434 SDLoc dl(Op); 2435 SDValue In = Op.getOperand(0); 2436 EVT InVT = In.getValueType(); 2437 2438 if (VT.getSizeInBits() < InVT.getSizeInBits()) { 2439 MVT CastVT = 2440 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), 2441 InVT.getVectorNumElements()); 2442 In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); 2443 return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); 2444 } 2445 2446 if (VT.getSizeInBits() > InVT.getSizeInBits()) { 2447 unsigned CastOpc = 2448 Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 2449 EVT CastVT = VT.changeVectorElementTypeToInteger(); 2450 In = DAG.getNode(CastOpc, dl, CastVT, In); 2451 return DAG.getNode(Op.getOpcode(), dl, VT, In); 2452 } 2453 2454 return Op; 2455 } 2456 2457 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, 2458 SelectionDAG &DAG) const { 2459 if (Op.getValueType().isVector()) 2460 return LowerVectorINT_TO_FP(Op, DAG); 2461 2462 // f16 conversions are promoted to f32 when full fp16 is not supported. 2463 if (Op.getValueType() == MVT::f16 && 2464 !Subtarget->hasFullFP16()) { 2465 SDLoc dl(Op); 2466 return DAG.getNode( 2467 ISD::FP_ROUND, dl, MVT::f16, 2468 DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)), 2469 DAG.getIntPtrConstant(0, dl)); 2470 } 2471 2472 // i128 conversions are libcalls. 2473 if (Op.getOperand(0).getValueType() == MVT::i128) 2474 return SDValue(); 2475 2476 // Other conversions are legal, unless it's to the completely software-based 2477 // fp128. 2478 if (Op.getValueType() != MVT::f128) 2479 return Op; 2480 2481 RTLIB::Libcall LC; 2482 if (Op.getOpcode() == ISD::SINT_TO_FP) 2483 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 2484 else 2485 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 2486 2487 return LowerF128Call(Op, DAG, LC); 2488 } 2489 2490 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, 2491 SelectionDAG &DAG) const { 2492 // For iOS, we want to call an alternative entry point: __sincos_stret, 2493 // which returns the values in two S / D registers. 
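  // The result is modelled as a fastcc call returning an {ArgTy, ArgTy}
  // struct whose two fields are the sine and cosine of the argument.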
2494 SDLoc dl(Op); 2495 SDValue Arg = Op.getOperand(0); 2496 EVT ArgVT = Arg.getValueType(); 2497 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 2498 2499 ArgListTy Args; 2500 ArgListEntry Entry; 2501 2502 Entry.Node = Arg; 2503 Entry.Ty = ArgTy; 2504 Entry.IsSExt = false; 2505 Entry.IsZExt = false; 2506 Args.push_back(Entry); 2507 2508 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64 2509 : RTLIB::SINCOS_STRET_F32; 2510 const char *LibcallName = getLibcallName(LC); 2511 SDValue Callee = 2512 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); 2513 2514 StructType *RetTy = StructType::get(ArgTy, ArgTy); 2515 TargetLowering::CallLoweringInfo CLI(DAG); 2516 CLI.setDebugLoc(dl) 2517 .setChain(DAG.getEntryNode()) 2518 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); 2519 2520 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 2521 return CallResult.first; 2522 } 2523 2524 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { 2525 if (Op.getValueType() != MVT::f16) 2526 return SDValue(); 2527 2528 assert(Op.getOperand(0).getValueType() == MVT::i16); 2529 SDLoc DL(Op); 2530 2531 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); 2532 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); 2533 return SDValue( 2534 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, 2535 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 2536 0); 2537 } 2538 2539 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 2540 if (OrigVT.getSizeInBits() >= 64) 2541 return OrigVT; 2542 2543 assert(OrigVT.isSimple() && "Expecting a simple value type"); 2544 2545 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 2546 switch (OrigSimpleTy) { 2547 default: llvm_unreachable("Unexpected Vector Type"); 2548 case MVT::v2i8: 2549 case MVT::v2i16: 2550 return MVT::v2i32; 2551 case MVT::v4i8: 2552 return MVT::v4i16; 2553 } 2554 } 2555 2556 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, 2557 const EVT &OrigTy, 2558 const EVT &ExtTy, 2559 unsigned ExtOpcode) { 2560 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 2561 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 2562 // 64-bits we need to insert a new extension so that it will be 64-bits. 2563 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 2564 if (OrigTy.getSizeInBits() >= 64) 2565 return N; 2566 2567 // Must extend size to at least 64 bits to be used as an operand for VMULL. 
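  // For example a v4i8 operand becomes v4i16, and v2i8/v2i16 become v2i32.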
2568 EVT NewVT = getExtensionTo64Bits(OrigTy); 2569 2570 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 2571 } 2572 2573 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 2574 bool isSigned) { 2575 EVT VT = N->getValueType(0); 2576 2577 if (N->getOpcode() != ISD::BUILD_VECTOR) 2578 return false; 2579 2580 for (const SDValue &Elt : N->op_values()) { 2581 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 2582 unsigned EltSize = VT.getScalarSizeInBits(); 2583 unsigned HalfSize = EltSize / 2; 2584 if (isSigned) { 2585 if (!isIntN(HalfSize, C->getSExtValue())) 2586 return false; 2587 } else { 2588 if (!isUIntN(HalfSize, C->getZExtValue())) 2589 return false; 2590 } 2591 continue; 2592 } 2593 return false; 2594 } 2595 2596 return true; 2597 } 2598 2599 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { 2600 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) 2601 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, 2602 N->getOperand(0)->getValueType(0), 2603 N->getValueType(0), 2604 N->getOpcode()); 2605 2606 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 2607 EVT VT = N->getValueType(0); 2608 SDLoc dl(N); 2609 unsigned EltSize = VT.getScalarSizeInBits() / 2; 2610 unsigned NumElts = VT.getVectorNumElements(); 2611 MVT TruncVT = MVT::getIntegerVT(EltSize); 2612 SmallVector<SDValue, 8> Ops; 2613 for (unsigned i = 0; i != NumElts; ++i) { 2614 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 2615 const APInt &CInt = C->getAPIntValue(); 2616 // Element types smaller than 32 bits are not legal, so use i32 elements. 2617 // The values are implicitly truncated so sext vs. zext doesn't matter. 2618 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 2619 } 2620 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); 2621 } 2622 2623 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 2624 return N->getOpcode() == ISD::SIGN_EXTEND || 2625 isExtendedBUILD_VECTOR(N, DAG, true); 2626 } 2627 2628 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 2629 return N->getOpcode() == ISD::ZERO_EXTEND || 2630 isExtendedBUILD_VECTOR(N, DAG, false); 2631 } 2632 2633 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 2634 unsigned Opcode = N->getOpcode(); 2635 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 2636 SDNode *N0 = N->getOperand(0).getNode(); 2637 SDNode *N1 = N->getOperand(1).getNode(); 2638 return N0->hasOneUse() && N1->hasOneUse() && 2639 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 2640 } 2641 return false; 2642 } 2643 2644 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 2645 unsigned Opcode = N->getOpcode(); 2646 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 2647 SDNode *N0 = N->getOperand(0).getNode(); 2648 SDNode *N1 = N->getOperand(1).getNode(); 2649 return N0->hasOneUse() && N1->hasOneUse() && 2650 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 2651 } 2652 return false; 2653 } 2654 2655 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 2656 SelectionDAG &DAG) const { 2657 // The rounding mode is in bits 23:22 of the FPSCR. 2658 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 2659 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) 2660 // so that the shift + and get folded into a bitfield extract. 
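  // For example, FPCR rounding mode 0 (round to nearest) yields
  // ((0 + (1 << 22)) >> 22) & 3 == 1, the corresponding FLT_ROUNDS value.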
2661 SDLoc dl(Op); 2662 2663 SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64, 2664 DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, 2665 MVT::i64)); 2666 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64); 2667 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32, 2668 DAG.getConstant(1U << 22, dl, MVT::i32)); 2669 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 2670 DAG.getConstant(22, dl, MVT::i32)); 2671 return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 2672 DAG.getConstant(3, dl, MVT::i32)); 2673 } 2674 2675 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 2676 // Multiplications are only custom-lowered for 128-bit vectors so that 2677 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 2678 EVT VT = Op.getValueType(); 2679 assert(VT.is128BitVector() && VT.isInteger() && 2680 "unexpected type for custom-lowering ISD::MUL"); 2681 SDNode *N0 = Op.getOperand(0).getNode(); 2682 SDNode *N1 = Op.getOperand(1).getNode(); 2683 unsigned NewOpc = 0; 2684 bool isMLA = false; 2685 bool isN0SExt = isSignExtended(N0, DAG); 2686 bool isN1SExt = isSignExtended(N1, DAG); 2687 if (isN0SExt && isN1SExt) 2688 NewOpc = AArch64ISD::SMULL; 2689 else { 2690 bool isN0ZExt = isZeroExtended(N0, DAG); 2691 bool isN1ZExt = isZeroExtended(N1, DAG); 2692 if (isN0ZExt && isN1ZExt) 2693 NewOpc = AArch64ISD::UMULL; 2694 else if (isN1SExt || isN1ZExt) { 2695 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 2696 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 2697 if (isN1SExt && isAddSubSExt(N0, DAG)) { 2698 NewOpc = AArch64ISD::SMULL; 2699 isMLA = true; 2700 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 2701 NewOpc = AArch64ISD::UMULL; 2702 isMLA = true; 2703 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 2704 std::swap(N0, N1); 2705 NewOpc = AArch64ISD::UMULL; 2706 isMLA = true; 2707 } 2708 } 2709 2710 if (!NewOpc) { 2711 if (VT == MVT::v2i64) 2712 // Fall through to expand this. It is not legal. 2713 return SDValue(); 2714 else 2715 // Other vector multiplications are legal. 2716 return Op; 2717 } 2718 } 2719 2720 // Legalize to a S/UMULL instruction 2721 SDLoc DL(Op); 2722 SDValue Op0; 2723 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); 2724 if (!isMLA) { 2725 Op0 = skipExtensionForVectorMULL(N0, DAG); 2726 assert(Op0.getValueType().is64BitVector() && 2727 Op1.getValueType().is64BitVector() && 2728 "unexpected types for extended operands to VMULL"); 2729 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 2730 } 2731 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during 2732 // isel lowering to take advantage of no-stall back to back s/umul + s/umla. 2733 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 2734 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); 2735 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); 2736 EVT Op1VT = Op1.getValueType(); 2737 return DAG.getNode(N0->getOpcode(), DL, VT, 2738 DAG.getNode(NewOpc, DL, VT, 2739 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 2740 DAG.getNode(NewOpc, DL, VT, 2741 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 2742 } 2743 2744 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 2745 SelectionDAG &DAG) const { 2746 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 2747 SDLoc dl(Op); 2748 switch (IntNo) { 2749 default: return SDValue(); // Don't custom lower most intrinsics. 
2750 case Intrinsic::thread_pointer: { 2751 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 2752 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); 2753 } 2754 case Intrinsic::aarch64_neon_abs: { 2755 EVT Ty = Op.getValueType(); 2756 if (Ty == MVT::i64) { 2757 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, 2758 Op.getOperand(1)); 2759 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result); 2760 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result); 2761 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) { 2762 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1)); 2763 } else { 2764 report_fatal_error("Unexpected type for AArch64 NEON intrinic"); 2765 } 2766 } 2767 case Intrinsic::aarch64_neon_smax: 2768 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), 2769 Op.getOperand(1), Op.getOperand(2)); 2770 case Intrinsic::aarch64_neon_umax: 2771 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), 2772 Op.getOperand(1), Op.getOperand(2)); 2773 case Intrinsic::aarch64_neon_smin: 2774 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), 2775 Op.getOperand(1), Op.getOperand(2)); 2776 case Intrinsic::aarch64_neon_umin: 2777 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), 2778 Op.getOperand(1), Op.getOperand(2)); 2779 2780 case Intrinsic::localaddress: { 2781 const auto &MF = DAG.getMachineFunction(); 2782 const auto *RegInfo = Subtarget->getRegisterInfo(); 2783 unsigned Reg = RegInfo->getLocalAddressRegister(MF); 2784 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, 2785 Op.getSimpleValueType()); 2786 } 2787 2788 case Intrinsic::eh_recoverfp: { 2789 // FIXME: This needs to be implemented to correctly handle highly aligned 2790 // stack objects. For now we simply return the incoming FP. Refer D53541 2791 // for more details. 2792 SDValue FnOp = Op.getOperand(1); 2793 SDValue IncomingFPOp = Op.getOperand(2); 2794 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp); 2795 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr); 2796 if (!Fn) 2797 report_fatal_error( 2798 "llvm.eh.recoverfp must take a function as the first argument"); 2799 return IncomingFPOp; 2800 } 2801 } 2802 } 2803 2804 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16. 2805 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, 2806 EVT VT, EVT MemVT, 2807 SelectionDAG &DAG) { 2808 assert(VT.isVector() && "VT should be a vector type"); 2809 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16); 2810 2811 SDValue Value = ST->getValue(); 2812 2813 // It first extend the promoted v4i16 to v8i16, truncate to v8i8, and extract 2814 // the word lane which represent the v4i8 subvector. It optimizes the store 2815 // to: 2816 // 2817 // xtn v0.8b, v0.8h 2818 // str s0, [x0] 2819 2820 SDValue Undef = DAG.getUNDEF(MVT::i16); 2821 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL, 2822 {Undef, Undef, Undef, Undef}); 2823 2824 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, 2825 Value, UndefVec); 2826 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt); 2827 2828 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc); 2829 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, 2830 Trunc, DAG.getConstant(0, DL, MVT::i64)); 2831 2832 return DAG.getStore(ST->getChain(), DL, ExtractTrunc, 2833 ST->getBasePtr(), ST->getMemOperand()); 2834 } 2835 2836 // Custom lowering for any store, vector or scalar and/or default or with 2837 // a truncate operations. 
Currently only custom lower truncate operation 2838 // from vector v4i16 to v4i8. 2839 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, 2840 SelectionDAG &DAG) const { 2841 SDLoc Dl(Op); 2842 StoreSDNode *StoreNode = cast<StoreSDNode>(Op); 2843 assert (StoreNode && "Can only custom lower store nodes"); 2844 2845 SDValue Value = StoreNode->getValue(); 2846 2847 EVT VT = Value.getValueType(); 2848 EVT MemVT = StoreNode->getMemoryVT(); 2849 2850 assert (VT.isVector() && "Can only custom lower vector store types"); 2851 2852 unsigned AS = StoreNode->getAddressSpace(); 2853 unsigned Align = StoreNode->getAlignment(); 2854 if (Align < MemVT.getStoreSize() && 2855 !allowsMisalignedMemoryAccesses( 2856 MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) { 2857 return scalarizeVectorStore(StoreNode, DAG); 2858 } 2859 2860 if (StoreNode->isTruncatingStore()) { 2861 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG); 2862 } 2863 2864 return SDValue(); 2865 } 2866 2867 SDValue AArch64TargetLowering::LowerOperation(SDValue Op, 2868 SelectionDAG &DAG) const { 2869 LLVM_DEBUG(dbgs() << "Custom lowering: "); 2870 LLVM_DEBUG(Op.dump()); 2871 2872 switch (Op.getOpcode()) { 2873 default: 2874 llvm_unreachable("unimplemented operand"); 2875 return SDValue(); 2876 case ISD::BITCAST: 2877 return LowerBITCAST(Op, DAG); 2878 case ISD::GlobalAddress: 2879 return LowerGlobalAddress(Op, DAG); 2880 case ISD::GlobalTLSAddress: 2881 return LowerGlobalTLSAddress(Op, DAG); 2882 case ISD::SETCC: 2883 return LowerSETCC(Op, DAG); 2884 case ISD::BR_CC: 2885 return LowerBR_CC(Op, DAG); 2886 case ISD::SELECT: 2887 return LowerSELECT(Op, DAG); 2888 case ISD::SELECT_CC: 2889 return LowerSELECT_CC(Op, DAG); 2890 case ISD::JumpTable: 2891 return LowerJumpTable(Op, DAG); 2892 case ISD::BR_JT: 2893 return LowerBR_JT(Op, DAG); 2894 case ISD::ConstantPool: 2895 return LowerConstantPool(Op, DAG); 2896 case ISD::BlockAddress: 2897 return LowerBlockAddress(Op, DAG); 2898 case ISD::VASTART: 2899 return LowerVASTART(Op, DAG); 2900 case ISD::VACOPY: 2901 return LowerVACOPY(Op, DAG); 2902 case ISD::VAARG: 2903 return LowerVAARG(Op, DAG); 2904 case ISD::ADDC: 2905 case ISD::ADDE: 2906 case ISD::SUBC: 2907 case ISD::SUBE: 2908 return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 2909 case ISD::SADDO: 2910 case ISD::UADDO: 2911 case ISD::SSUBO: 2912 case ISD::USUBO: 2913 case ISD::SMULO: 2914 case ISD::UMULO: 2915 return LowerXALUO(Op, DAG); 2916 case ISD::FADD: 2917 return LowerF128Call(Op, DAG, RTLIB::ADD_F128); 2918 case ISD::FSUB: 2919 return LowerF128Call(Op, DAG, RTLIB::SUB_F128); 2920 case ISD::FMUL: 2921 return LowerF128Call(Op, DAG, RTLIB::MUL_F128); 2922 case ISD::FDIV: 2923 return LowerF128Call(Op, DAG, RTLIB::DIV_F128); 2924 case ISD::FP_ROUND: 2925 return LowerFP_ROUND(Op, DAG); 2926 case ISD::FP_EXTEND: 2927 return LowerFP_EXTEND(Op, DAG); 2928 case ISD::FRAMEADDR: 2929 return LowerFRAMEADDR(Op, DAG); 2930 case ISD::SPONENTRY: 2931 return LowerSPONENTRY(Op, DAG); 2932 case ISD::RETURNADDR: 2933 return LowerRETURNADDR(Op, DAG); 2934 case ISD::ADDROFRETURNADDR: 2935 return LowerADDROFRETURNADDR(Op, DAG); 2936 case ISD::INSERT_VECTOR_ELT: 2937 return LowerINSERT_VECTOR_ELT(Op, DAG); 2938 case ISD::EXTRACT_VECTOR_ELT: 2939 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2940 case ISD::BUILD_VECTOR: 2941 return LowerBUILD_VECTOR(Op, DAG); 2942 case ISD::VECTOR_SHUFFLE: 2943 return LowerVECTOR_SHUFFLE(Op, DAG); 2944 case ISD::EXTRACT_SUBVECTOR: 2945 return LowerEXTRACT_SUBVECTOR(Op, DAG); 2946 case ISD::SRA: 2947 case 
ISD::SRL: 2948 case ISD::SHL: 2949 return LowerVectorSRA_SRL_SHL(Op, DAG); 2950 case ISD::SHL_PARTS: 2951 return LowerShiftLeftParts(Op, DAG); 2952 case ISD::SRL_PARTS: 2953 case ISD::SRA_PARTS: 2954 return LowerShiftRightParts(Op, DAG); 2955 case ISD::CTPOP: 2956 return LowerCTPOP(Op, DAG); 2957 case ISD::FCOPYSIGN: 2958 return LowerFCOPYSIGN(Op, DAG); 2959 case ISD::OR: 2960 return LowerVectorOR(Op, DAG); 2961 case ISD::XOR: 2962 return LowerXOR(Op, DAG); 2963 case ISD::PREFETCH: 2964 return LowerPREFETCH(Op, DAG); 2965 case ISD::SINT_TO_FP: 2966 case ISD::UINT_TO_FP: 2967 return LowerINT_TO_FP(Op, DAG); 2968 case ISD::FP_TO_SINT: 2969 case ISD::FP_TO_UINT: 2970 return LowerFP_TO_INT(Op, DAG); 2971 case ISD::FSINCOS: 2972 return LowerFSINCOS(Op, DAG); 2973 case ISD::FLT_ROUNDS_: 2974 return LowerFLT_ROUNDS_(Op, DAG); 2975 case ISD::MUL: 2976 return LowerMUL(Op, DAG); 2977 case ISD::INTRINSIC_WO_CHAIN: 2978 return LowerINTRINSIC_WO_CHAIN(Op, DAG); 2979 case ISD::STORE: 2980 return LowerSTORE(Op, DAG); 2981 case ISD::VECREDUCE_ADD: 2982 case ISD::VECREDUCE_SMAX: 2983 case ISD::VECREDUCE_SMIN: 2984 case ISD::VECREDUCE_UMAX: 2985 case ISD::VECREDUCE_UMIN: 2986 case ISD::VECREDUCE_FMAX: 2987 case ISD::VECREDUCE_FMIN: 2988 return LowerVECREDUCE(Op, DAG); 2989 case ISD::ATOMIC_LOAD_SUB: 2990 return LowerATOMIC_LOAD_SUB(Op, DAG); 2991 case ISD::ATOMIC_LOAD_AND: 2992 return LowerATOMIC_LOAD_AND(Op, DAG); 2993 case ISD::DYNAMIC_STACKALLOC: 2994 return LowerDYNAMIC_STACKALLOC(Op, DAG); 2995 } 2996 } 2997 2998 //===----------------------------------------------------------------------===// 2999 // Calling Convention Implementation 3000 //===----------------------------------------------------------------------===// 3001 3002 /// Selects the correct CCAssignFn for a given CallingConvention value. 3003 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, 3004 bool IsVarArg) const { 3005 switch (CC) { 3006 default: 3007 report_fatal_error("Unsupported calling convention."); 3008 case CallingConv::WebKit_JS: 3009 return CC_AArch64_WebKit_JS; 3010 case CallingConv::GHC: 3011 return CC_AArch64_GHC; 3012 case CallingConv::C: 3013 case CallingConv::Fast: 3014 case CallingConv::PreserveMost: 3015 case CallingConv::CXX_FAST_TLS: 3016 case CallingConv::Swift: 3017 if (Subtarget->isTargetWindows() && IsVarArg) 3018 return CC_AArch64_Win64_VarArg; 3019 if (!Subtarget->isTargetDarwin()) 3020 return CC_AArch64_AAPCS; 3021 return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; 3022 case CallingConv::Win64: 3023 return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; 3024 case CallingConv::AArch64_VectorCall: 3025 return CC_AArch64_AAPCS; 3026 } 3027 } 3028 3029 CCAssignFn * 3030 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const { 3031 return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS 3032 : RetCC_AArch64_AAPCS; 3033 } 3034 3035 SDValue AArch64TargetLowering::LowerFormalArguments( 3036 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3037 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 3038 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3039 MachineFunction &MF = DAG.getMachineFunction(); 3040 MachineFrameInfo &MFI = MF.getFrameInfo(); 3041 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); 3042 3043 // Assign locations to all of the incoming arguments. 
3044 SmallVector<CCValAssign, 16> ArgLocs; 3045 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 3046 *DAG.getContext()); 3047 3048 // At this point, Ins[].VT may already be promoted to i32. To correctly 3049 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 3050 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 3051 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here 3052 // we use a special version of AnalyzeFormalArguments to pass in ValVT and 3053 // LocVT. 3054 unsigned NumArgs = Ins.size(); 3055 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); 3056 unsigned CurArgIdx = 0; 3057 for (unsigned i = 0; i != NumArgs; ++i) { 3058 MVT ValVT = Ins[i].VT; 3059 if (Ins[i].isOrigArg()) { 3060 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); 3061 CurArgIdx = Ins[i].getOrigArgIndex(); 3062 3063 // Get type of the original argument. 3064 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), 3065 /*AllowUnknown*/ true); 3066 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; 3067 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 3068 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) 3069 ValVT = MVT::i8; 3070 else if (ActualMVT == MVT::i16) 3071 ValVT = MVT::i16; 3072 } 3073 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); 3074 bool Res = 3075 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); 3076 assert(!Res && "Call operand has unhandled type"); 3077 (void)Res; 3078 } 3079 assert(ArgLocs.size() == Ins.size()); 3080 SmallVector<SDValue, 16> ArgValues; 3081 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3082 CCValAssign &VA = ArgLocs[i]; 3083 3084 if (Ins[i].Flags.isByVal()) { 3085 // Byval is used for HFAs in the PCS, but the system should work in a 3086 // non-compliant manner for larger structs. 3087 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3088 int Size = Ins[i].Flags.getByValSize(); 3089 unsigned NumRegs = (Size + 7) / 8; 3090 3091 // FIXME: This works on big-endian for composite byvals, which are the common 3092 // case. It should also work for fundamental types too. 3093 unsigned FrameIdx = 3094 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); 3095 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); 3096 InVals.push_back(FrameIdxN); 3097 3098 continue; 3099 } 3100 3101 if (VA.isRegLoc()) { 3102 // Arguments stored in registers. 3103 EVT RegVT = VA.getLocVT(); 3104 3105 SDValue ArgValue; 3106 const TargetRegisterClass *RC; 3107 3108 if (RegVT == MVT::i32) 3109 RC = &AArch64::GPR32RegClass; 3110 else if (RegVT == MVT::i64) 3111 RC = &AArch64::GPR64RegClass; 3112 else if (RegVT == MVT::f16) 3113 RC = &AArch64::FPR16RegClass; 3114 else if (RegVT == MVT::f32) 3115 RC = &AArch64::FPR32RegClass; 3116 else if (RegVT == MVT::f64 || RegVT.is64BitVector()) 3117 RC = &AArch64::FPR64RegClass; 3118 else if (RegVT == MVT::f128 || RegVT.is128BitVector()) 3119 RC = &AArch64::FPR128RegClass; 3120 else 3121 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 3122 3123 // Transform the arguments in physical registers into virtual ones. 3124 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 3125 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); 3126 3127 // If this is an 8, 16 or 32-bit value, it is really passed promoted 3128 // to 64 bits. Insert an assert[sz]ext to capture this, then 3129 // truncate to the right size. 
3130 switch (VA.getLocInfo()) { 3131 default: 3132 llvm_unreachable("Unknown loc info!"); 3133 case CCValAssign::Full: 3134 break; 3135 case CCValAssign::BCvt: 3136 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); 3137 break; 3138 case CCValAssign::AExt: 3139 case CCValAssign::SExt: 3140 case CCValAssign::ZExt: 3141 // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt 3142 // nodes after our lowering. 3143 assert(RegVT == Ins[i].VT && "incorrect register location selected"); 3144 break; 3145 } 3146 3147 InVals.push_back(ArgValue); 3148 3149 } else { // VA.isRegLoc() 3150 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); 3151 unsigned ArgOffset = VA.getLocMemOffset(); 3152 unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; 3153 3154 uint32_t BEAlign = 0; 3155 if (!Subtarget->isLittleEndian() && ArgSize < 8 && 3156 !Ins[i].Flags.isInConsecutiveRegs()) 3157 BEAlign = 8 - ArgSize; 3158 3159 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); 3160 3161 // Create load nodes to retrieve arguments from the stack. 3162 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 3163 SDValue ArgValue; 3164 3165 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) 3166 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; 3167 MVT MemVT = VA.getValVT(); 3168 3169 switch (VA.getLocInfo()) { 3170 default: 3171 break; 3172 case CCValAssign::BCvt: 3173 MemVT = VA.getLocVT(); 3174 break; 3175 case CCValAssign::SExt: 3176 ExtType = ISD::SEXTLOAD; 3177 break; 3178 case CCValAssign::ZExt: 3179 ExtType = ISD::ZEXTLOAD; 3180 break; 3181 case CCValAssign::AExt: 3182 ExtType = ISD::EXTLOAD; 3183 break; 3184 } 3185 3186 ArgValue = DAG.getExtLoad( 3187 ExtType, DL, VA.getLocVT(), Chain, FIN, 3188 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), 3189 MemVT); 3190 3191 InVals.push_back(ArgValue); 3192 } 3193 } 3194 3195 // varargs 3196 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 3197 if (isVarArg) { 3198 if (!Subtarget->isTargetDarwin() || IsWin64) { 3199 // The AAPCS variadic function ABI is identical to the non-variadic 3200 // one. As a result there may be more arguments in registers and we should 3201 // save them for future reference. 3202 // Win64 variadic functions also pass arguments in registers, but all float 3203 // arguments are passed in integer registers. 3204 saveVarArgRegisters(CCInfo, DAG, DL, Chain); 3205 } 3206 3207 // This will point to the next argument passed via stack. 3208 unsigned StackOffset = CCInfo.getNextStackOffset(); 3209 // We currently pass all varargs at 8-byte alignment. 3210 StackOffset = ((StackOffset + 7) & ~7); 3211 FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); 3212 3213 if (MFI.hasMustTailInVarArgFunc()) { 3214 SmallVector<MVT, 2> RegParmTypes; 3215 RegParmTypes.push_back(MVT::i64); 3216 RegParmTypes.push_back(MVT::f128); 3217 // Compute the set of forwarded registers. The rest are scratch. 3218 SmallVectorImpl<ForwardedRegister> &Forwards = 3219 FuncInfo->getForwardedMustTailRegParms(); 3220 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, 3221 CC_AArch64_AAPCS); 3222 3223 // Conservatively forward X8, since it might be used for aggregate return. 
3224 if (!CCInfo.isAllocated(AArch64::X8)) { 3225 unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); 3226 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64)); 3227 } 3228 } 3229 } 3230 3231 // On Windows, InReg pointers must be returned, so record the pointer in a 3232 // virtual register at the start of the function so it can be returned in the 3233 // epilogue. 3234 if (IsWin64) { 3235 for (unsigned I = 0, E = Ins.size(); I != E; ++I) { 3236 if (Ins[I].Flags.isInReg()) { 3237 assert(!FuncInfo->getSRetReturnReg()); 3238 3239 MVT PtrTy = getPointerTy(DAG.getDataLayout()); 3240 unsigned Reg = 3241 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); 3242 FuncInfo->setSRetReturnReg(Reg); 3243 3244 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]); 3245 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain); 3246 break; 3247 } 3248 } 3249 } 3250 3251 unsigned StackArgSize = CCInfo.getNextStackOffset(); 3252 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 3253 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { 3254 // This is a non-standard ABI so by fiat I say we're allowed to make full 3255 // use of the stack area to be popped, which must be aligned to 16 bytes in 3256 // any case: 3257 StackArgSize = alignTo(StackArgSize, 16); 3258 3259 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding 3260 // a multiple of 16. 3261 FuncInfo->setArgumentStackToRestore(StackArgSize); 3262 3263 // This realignment carries over to the available bytes below. Our own 3264 // callers will guarantee the space is free by giving an aligned value to 3265 // CALLSEQ_START. 3266 } 3267 // Even if we're not expected to free up the space, it's useful to know how 3268 // much is there while considering tail calls (because we can reuse it). 3269 FuncInfo->setBytesInStackArgArea(StackArgSize); 3270 3271 if (Subtarget->hasCustomCallingConv()) 3272 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); 3273 3274 return Chain; 3275 } 3276 3277 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, 3278 SelectionDAG &DAG, 3279 const SDLoc &DL, 3280 SDValue &Chain) const { 3281 MachineFunction &MF = DAG.getMachineFunction(); 3282 MachineFrameInfo &MFI = MF.getFrameInfo(); 3283 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 3284 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3285 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); 3286 3287 SmallVector<SDValue, 8> MemOps; 3288 3289 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, 3290 AArch64::X3, AArch64::X4, AArch64::X5, 3291 AArch64::X6, AArch64::X7 }; 3292 static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); 3293 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); 3294 3295 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); 3296 int GPRIdx = 0; 3297 if (GPRSaveSize != 0) { 3298 if (IsWin64) { 3299 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false); 3300 if (GPRSaveSize & 15) 3301 // The extra size here, if triggered, will always be 8. 
3302 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false); 3303 } else 3304 GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); 3305 3306 SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); 3307 3308 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { 3309 unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); 3310 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); 3311 SDValue Store = DAG.getStore( 3312 Val.getValue(1), DL, Val, FIN, 3313 IsWin64 3314 ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), 3315 GPRIdx, 3316 (i - FirstVariadicGPR) * 8) 3317 : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); 3318 MemOps.push_back(Store); 3319 FIN = 3320 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); 3321 } 3322 } 3323 FuncInfo->setVarArgsGPRIndex(GPRIdx); 3324 FuncInfo->setVarArgsGPRSize(GPRSaveSize); 3325 3326 if (Subtarget->hasFPARMv8() && !IsWin64) { 3327 static const MCPhysReg FPRArgRegs[] = { 3328 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, 3329 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; 3330 static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); 3331 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); 3332 3333 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); 3334 int FPRIdx = 0; 3335 if (FPRSaveSize != 0) { 3336 FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false); 3337 3338 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); 3339 3340 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { 3341 unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); 3342 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); 3343 3344 SDValue Store = DAG.getStore( 3345 Val.getValue(1), DL, Val, FIN, 3346 MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16)); 3347 MemOps.push_back(Store); 3348 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, 3349 DAG.getConstant(16, DL, PtrVT)); 3350 } 3351 } 3352 FuncInfo->setVarArgsFPRIndex(FPRIdx); 3353 FuncInfo->setVarArgsFPRSize(FPRSaveSize); 3354 } 3355 3356 if (!MemOps.empty()) { 3357 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 3358 } 3359 } 3360 3361 /// LowerCallResult - Lower the result values of a call into the 3362 /// appropriate copies out of appropriate physical registers. 3363 SDValue AArch64TargetLowering::LowerCallResult( 3364 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 3365 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 3366 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 3367 SDValue ThisVal) const { 3368 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS 3369 ? RetCC_AArch64_WebKit_JS 3370 : RetCC_AArch64_AAPCS; 3371 // Assign locations to each value returned by this call. 3372 SmallVector<CCValAssign, 16> RVLocs; 3373 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 3374 *DAG.getContext()); 3375 CCInfo.AnalyzeCallResult(Ins, RetCC); 3376 3377 // Copy all of the result registers out of their specified physreg. 
3378 for (unsigned i = 0; i != RVLocs.size(); ++i) { 3379 CCValAssign VA = RVLocs[i]; 3380 3381 // Pass 'this' value directly from the argument to return value, to avoid 3382 // reg unit interference 3383 if (i == 0 && isThisReturn) { 3384 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && 3385 "unexpected return calling convention register assignment"); 3386 InVals.push_back(ThisVal); 3387 continue; 3388 } 3389 3390 SDValue Val = 3391 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); 3392 Chain = Val.getValue(1); 3393 InFlag = Val.getValue(2); 3394 3395 switch (VA.getLocInfo()) { 3396 default: 3397 llvm_unreachable("Unknown loc info!"); 3398 case CCValAssign::Full: 3399 break; 3400 case CCValAssign::BCvt: 3401 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); 3402 break; 3403 } 3404 3405 InVals.push_back(Val); 3406 } 3407 3408 return Chain; 3409 } 3410 3411 /// Return true if the calling convention is one that we can guarantee TCO for. 3412 static bool canGuaranteeTCO(CallingConv::ID CC) { 3413 return CC == CallingConv::Fast; 3414 } 3415 3416 /// Return true if we might ever do TCO for calls with this calling convention. 3417 static bool mayTailCallThisCC(CallingConv::ID CC) { 3418 switch (CC) { 3419 case CallingConv::C: 3420 case CallingConv::PreserveMost: 3421 case CallingConv::Swift: 3422 return true; 3423 default: 3424 return canGuaranteeTCO(CC); 3425 } 3426 } 3427 3428 bool AArch64TargetLowering::isEligibleForTailCallOptimization( 3429 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 3430 const SmallVectorImpl<ISD::OutputArg> &Outs, 3431 const SmallVectorImpl<SDValue> &OutVals, 3432 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { 3433 if (!mayTailCallThisCC(CalleeCC)) 3434 return false; 3435 3436 MachineFunction &MF = DAG.getMachineFunction(); 3437 const Function &CallerF = MF.getFunction(); 3438 CallingConv::ID CallerCC = CallerF.getCallingConv(); 3439 bool CCMatch = CallerCC == CalleeCC; 3440 3441 // Byval parameters hand the function a pointer directly into the stack area 3442 // we want to reuse during a tail call. Working around this *is* possible (see 3443 // X86) but less efficient and uglier in LowerCall. 3444 for (Function::const_arg_iterator i = CallerF.arg_begin(), 3445 e = CallerF.arg_end(); 3446 i != e; ++i) { 3447 if (i->hasByValAttr()) 3448 return false; 3449 3450 // On Windows, "inreg" attributes signify non-aggregate indirect returns. 3451 // In this case, it is necessary to save/restore X0 in the callee. Tail 3452 // call opt interferes with this. So we disable tail call opt when the 3453 // caller has an argument with "inreg" attribute. 3454 3455 // FIXME: Check whether the callee also has an "inreg" argument. 3456 if (i->hasInRegAttr()) 3457 return false; 3458 } 3459 3460 if (getTargetMachine().Options.GuaranteedTailCallOpt) 3461 return canGuaranteeTCO(CalleeCC) && CCMatch; 3462 3463 // Externally-defined functions with weak linkage should not be 3464 // tail-called on AArch64 when the OS does not support dynamic 3465 // pre-emption of symbols, as the AAELF spec requires normal calls 3466 // to undefined weak functions to be replaced with a NOP or jump to the 3467 // next instruction. The behaviour of branch instructions in this 3468 // situation (as used for tail calls) is implementation-defined, so we 3469 // cannot rely on the linker replacing the tail call with a return. 
3470 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 3471 const GlobalValue *GV = G->getGlobal(); 3472 const Triple &TT = getTargetMachine().getTargetTriple(); 3473 if (GV->hasExternalWeakLinkage() && 3474 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 3475 return false; 3476 } 3477 3478 // Now we search for cases where we can use a tail call without changing the 3479 // ABI. Sibcall is used in some places (particularly gcc) to refer to this 3480 // concept. 3481 3482 // I want anyone implementing a new calling convention to think long and hard 3483 // about this assert. 3484 assert((!isVarArg || CalleeCC == CallingConv::C) && 3485 "Unexpected variadic calling convention"); 3486 3487 LLVMContext &C = *DAG.getContext(); 3488 if (isVarArg && !Outs.empty()) { 3489 // At least two cases here: if caller is fastcc then we can't have any 3490 // memory arguments (we'd be expected to clean up the stack afterwards). If 3491 // caller is C then we could potentially use its argument area. 3492 3493 // FIXME: for now we take the most conservative of these in both cases: 3494 // disallow all variadic memory operands. 3495 SmallVector<CCValAssign, 16> ArgLocs; 3496 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 3497 3498 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); 3499 for (const CCValAssign &ArgLoc : ArgLocs) 3500 if (!ArgLoc.isRegLoc()) 3501 return false; 3502 } 3503 3504 // Check that the call results are passed in the same way. 3505 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, 3506 CCAssignFnForCall(CalleeCC, isVarArg), 3507 CCAssignFnForCall(CallerCC, isVarArg))) 3508 return false; 3509 // The callee has to preserve all registers the caller needs to preserve. 3510 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 3511 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 3512 if (!CCMatch) { 3513 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 3514 if (Subtarget->hasCustomCallingConv()) { 3515 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved); 3516 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved); 3517 } 3518 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 3519 return false; 3520 } 3521 3522 // Nothing more to check if the callee is taking no arguments 3523 if (Outs.empty()) 3524 return true; 3525 3526 SmallVector<CCValAssign, 16> ArgLocs; 3527 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 3528 3529 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); 3530 3531 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 3532 3533 // If the stack arguments for this call do not fit into our own save area then 3534 // the call cannot be made tail. 3535 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) 3536 return false; 3537 3538 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3539 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 3540 return false; 3541 3542 return true; 3543 } 3544 3545 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, 3546 SelectionDAG &DAG, 3547 MachineFrameInfo &MFI, 3548 int ClobberedFI) const { 3549 SmallVector<SDValue, 8> ArgChains; 3550 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); 3551 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; 3552 3553 // Include the original chain at the beginning of the list. 
When this is 3554 // used by target LowerCall hooks, this helps legalize find the 3555 // CALLSEQ_BEGIN node. 3556 ArgChains.push_back(Chain); 3557 3558 // Add a chain value for each stack argument corresponding 3559 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), 3560 UE = DAG.getEntryNode().getNode()->use_end(); 3561 U != UE; ++U) 3562 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) 3563 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) 3564 if (FI->getIndex() < 0) { 3565 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); 3566 int64_t InLastByte = InFirstByte; 3567 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; 3568 3569 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || 3570 (FirstByte <= InFirstByte && InFirstByte <= LastByte)) 3571 ArgChains.push_back(SDValue(L, 1)); 3572 } 3573 3574 // Build a tokenfactor for all the chains. 3575 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); 3576 } 3577 3578 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, 3579 bool TailCallOpt) const { 3580 return CallCC == CallingConv::Fast && TailCallOpt; 3581 } 3582 3583 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, 3584 /// and add input and output parameter nodes. 3585 SDValue 3586 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, 3587 SmallVectorImpl<SDValue> &InVals) const { 3588 SelectionDAG &DAG = CLI.DAG; 3589 SDLoc &DL = CLI.DL; 3590 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 3591 SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 3592 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 3593 SDValue Chain = CLI.Chain; 3594 SDValue Callee = CLI.Callee; 3595 bool &IsTailCall = CLI.IsTailCall; 3596 CallingConv::ID CallConv = CLI.CallConv; 3597 bool IsVarArg = CLI.IsVarArg; 3598 3599 MachineFunction &MF = DAG.getMachineFunction(); 3600 bool IsThisReturn = false; 3601 3602 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 3603 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 3604 bool IsSibCall = false; 3605 3606 if (IsTailCall) { 3607 // Check if it's really possible to do a tail call. 3608 IsTailCall = isEligibleForTailCallOptimization( 3609 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); 3610 if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) 3611 report_fatal_error("failed to perform tail call elimination on a call " 3612 "site marked musttail"); 3613 3614 // A sibling call is one where we're under the usual C ABI and not planning 3615 // to change that but can still do a tail call: 3616 if (!TailCallOpt && IsTailCall) 3617 IsSibCall = true; 3618 3619 if (IsTailCall) 3620 ++NumTailCalls; 3621 } 3622 3623 // Analyze operands of the call, assigning locations to each operand. 3624 SmallVector<CCValAssign, 16> ArgLocs; 3625 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, 3626 *DAG.getContext()); 3627 3628 if (IsVarArg) { 3629 // Handle fixed and variable vector arguments differently. 3630 // Variable vector arguments always go into memory. 
3631 unsigned NumArgs = Outs.size(); 3632 3633 for (unsigned i = 0; i != NumArgs; ++i) { 3634 MVT ArgVT = Outs[i].VT; 3635 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 3636 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, 3637 /*IsVarArg=*/ !Outs[i].IsFixed); 3638 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); 3639 assert(!Res && "Call operand has unhandled type"); 3640 (void)Res; 3641 } 3642 } else { 3643 // At this point, Outs[].VT may already be promoted to i32. To correctly 3644 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 3645 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 3646 // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here 3647 // we use a special version of AnalyzeCallOperands to pass in ValVT and 3648 // LocVT. 3649 unsigned NumArgs = Outs.size(); 3650 for (unsigned i = 0; i != NumArgs; ++i) { 3651 MVT ValVT = Outs[i].VT; 3652 // Get type of the original argument. 3653 EVT ActualVT = getValueType(DAG.getDataLayout(), 3654 CLI.getArgs()[Outs[i].OrigArgIndex].Ty, 3655 /*AllowUnknown*/ true); 3656 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; 3657 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; 3658 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 3659 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) 3660 ValVT = MVT::i8; 3661 else if (ActualMVT == MVT::i16) 3662 ValVT = MVT::i16; 3663 3664 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); 3665 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); 3666 assert(!Res && "Call operand has unhandled type"); 3667 (void)Res; 3668 } 3669 } 3670 3671 // Get a count of how many bytes are to be pushed on the stack. 3672 unsigned NumBytes = CCInfo.getNextStackOffset(); 3673 3674 if (IsSibCall) { 3675 // Since we're not changing the ABI to make this a tail call, the memory 3676 // operands are already available in the caller's incoming argument space. 3677 NumBytes = 0; 3678 } 3679 3680 // FPDiff is the byte offset of the call's argument area from the callee's. 3681 // Stores to callee stack arguments will be placed in FixedStackSlots offset 3682 // by this amount for a tail call. In a sibling call it must be 0 because the 3683 // caller will deallocate the entire stack and the callee still expects its 3684 // arguments to begin at SP+0. Completely unused for non-tail calls. 3685 int FPDiff = 0; 3686 3687 if (IsTailCall && !IsSibCall) { 3688 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); 3689 3690 // Since callee will pop argument stack as a tail call, we must keep the 3691 // popped size 16-byte aligned. 3692 NumBytes = alignTo(NumBytes, 16); 3693 3694 // FPDiff will be negative if this tail call requires more space than we 3695 // would automatically have in our incoming argument space. Positive if we 3696 // can actually shrink the stack. 3697 FPDiff = NumReusableBytes - NumBytes; 3698 3699 // The stack pointer must be 16-byte aligned at all times it's used for a 3700 // memory operation, which in practice means at *all* times and in 3701 // particular across call boundaries. Therefore our own arguments started at 3702 // a 16-byte aligned SP and the delta applied for the tail call should 3703 // satisfy the same constraint. 3704 assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); 3705 } 3706 3707 // Adjust the stack pointer for the new arguments... 
3708 // These operations are automatically eliminated by the prolog/epilog pass 3709 if (!IsSibCall) 3710 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); 3711 3712 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, 3713 getPointerTy(DAG.getDataLayout())); 3714 3715 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 3716 SmallVector<SDValue, 8> MemOpChains; 3717 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3718 3719 if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) { 3720 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); 3721 for (const auto &F : Forwards) { 3722 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); 3723 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); 3724 } 3725 } 3726 3727 // Walk the register/memloc assignments, inserting copies/loads. 3728 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; 3729 ++i, ++realArgIdx) { 3730 CCValAssign &VA = ArgLocs[i]; 3731 SDValue Arg = OutVals[realArgIdx]; 3732 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 3733 3734 // Promote the value if needed. 3735 switch (VA.getLocInfo()) { 3736 default: 3737 llvm_unreachable("Unknown loc info!"); 3738 case CCValAssign::Full: 3739 break; 3740 case CCValAssign::SExt: 3741 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 3742 break; 3743 case CCValAssign::ZExt: 3744 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 3745 break; 3746 case CCValAssign::AExt: 3747 if (Outs[realArgIdx].ArgVT == MVT::i1) { 3748 // AAPCS requires i1 to be zero-extended to 8-bits by the caller. 3749 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); 3750 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); 3751 } 3752 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 3753 break; 3754 case CCValAssign::BCvt: 3755 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 3756 break; 3757 case CCValAssign::FPExt: 3758 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); 3759 break; 3760 } 3761 3762 if (VA.isRegLoc()) { 3763 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && 3764 Outs[0].VT == MVT::i64) { 3765 assert(VA.getLocVT() == MVT::i64 && 3766 "unexpected calling convention register assignment"); 3767 assert(!Ins.empty() && Ins[0].VT == MVT::i64 && 3768 "unexpected use of 'returned'"); 3769 IsThisReturn = true; 3770 } 3771 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 3772 } else { 3773 assert(VA.isMemLoc()); 3774 3775 SDValue DstAddr; 3776 MachinePointerInfo DstInfo; 3777 3778 // FIXME: This works on big-endian for composite byvals, which are the 3779 // common case. It should also work for fundamental types too. 3780 uint32_t BEAlign = 0; 3781 unsigned OpSize = Flags.isByVal() ? 
Flags.getByValSize() * 8 3782 : VA.getValVT().getSizeInBits(); 3783 OpSize = (OpSize + 7) / 8; 3784 if (!Subtarget->isLittleEndian() && !Flags.isByVal() && 3785 !Flags.isInConsecutiveRegs()) { 3786 if (OpSize < 8) 3787 BEAlign = 8 - OpSize; 3788 } 3789 unsigned LocMemOffset = VA.getLocMemOffset(); 3790 int32_t Offset = LocMemOffset + BEAlign; 3791 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 3792 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 3793 3794 if (IsTailCall) { 3795 Offset = Offset + FPDiff; 3796 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); 3797 3798 DstAddr = DAG.getFrameIndex(FI, PtrVT); 3799 DstInfo = 3800 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); 3801 3802 // Make sure any stack arguments overlapping with where we're storing 3803 // are loaded before this eventual operation. Otherwise they'll be 3804 // clobbered. 3805 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); 3806 } else { 3807 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); 3808 3809 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); 3810 DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(), 3811 LocMemOffset); 3812 } 3813 3814 if (Outs[i].Flags.isByVal()) { 3815 SDValue SizeNode = 3816 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); 3817 SDValue Cpy = DAG.getMemcpy( 3818 Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), 3819 /*isVol = */ false, /*AlwaysInline = */ false, 3820 /*isTailCall = */ false, 3821 DstInfo, MachinePointerInfo()); 3822 3823 MemOpChains.push_back(Cpy); 3824 } else { 3825 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already 3826 // promoted to a legal register type i32, we should truncate Arg back to 3827 // i1/i8/i16. 3828 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || 3829 VA.getValVT() == MVT::i16) 3830 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); 3831 3832 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); 3833 MemOpChains.push_back(Store); 3834 } 3835 } 3836 } 3837 3838 if (!MemOpChains.empty()) 3839 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); 3840 3841 // Build a sequence of copy-to-reg nodes chained together with token chain 3842 // and flag operands which copy the outgoing args into the appropriate regs. 3843 SDValue InFlag; 3844 for (auto &RegToPass : RegsToPass) { 3845 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, 3846 RegToPass.second, InFlag); 3847 InFlag = Chain.getValue(1); 3848 } 3849 3850 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 3851 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 3852 // node so that legalize doesn't hack it. 
3853 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 3854 auto GV = G->getGlobal(); 3855 if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) == 3856 AArch64II::MO_GOT) { 3857 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); 3858 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); 3859 } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) { 3860 assert(Subtarget->isTargetWindows() && 3861 "Windows is the only supported COFF target"); 3862 Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT); 3863 } else { 3864 const GlobalValue *GV = G->getGlobal(); 3865 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); 3866 } 3867 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 3868 if (getTargetMachine().getCodeModel() == CodeModel::Large && 3869 Subtarget->isTargetMachO()) { 3870 const char *Sym = S->getSymbol(); 3871 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); 3872 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); 3873 } else { 3874 const char *Sym = S->getSymbol(); 3875 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); 3876 } 3877 } 3878 3879 // We don't usually want to end the call-sequence here because we would tidy 3880 // the frame up *after* the call, however in the ABI-changing tail-call case 3881 // we've carefully laid out the parameters so that when sp is reset they'll be 3882 // in the correct location. 3883 if (IsTailCall && !IsSibCall) { 3884 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), 3885 DAG.getIntPtrConstant(0, DL, true), InFlag, DL); 3886 InFlag = Chain.getValue(1); 3887 } 3888 3889 std::vector<SDValue> Ops; 3890 Ops.push_back(Chain); 3891 Ops.push_back(Callee); 3892 3893 if (IsTailCall) { 3894 // Each tail call may have to adjust the stack by a different amount, so 3895 // this information must travel along with the operation for eventual 3896 // consumption by emitEpilogue. 3897 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); 3898 } 3899 3900 // Add argument registers to the end of the list so that they are known live 3901 // into the call. 3902 for (auto &RegToPass : RegsToPass) 3903 Ops.push_back(DAG.getRegister(RegToPass.first, 3904 RegToPass.second.getValueType())); 3905 3906 // Add a register mask operand representing the call-preserved registers. 3907 const uint32_t *Mask; 3908 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 3909 if (IsThisReturn) { 3910 // For 'this' returns, use the X0-preserving mask if applicable 3911 Mask = TRI->getThisReturnPreservedMask(MF, CallConv); 3912 if (!Mask) { 3913 IsThisReturn = false; 3914 Mask = TRI->getCallPreservedMask(MF, CallConv); 3915 } 3916 } else 3917 Mask = TRI->getCallPreservedMask(MF, CallConv); 3918 3919 if (Subtarget->hasCustomCallingConv()) 3920 TRI->UpdateCustomCallPreservedMask(MF, &Mask); 3921 3922 if (TRI->isAnyArgRegReserved(MF)) 3923 TRI->emitReservedArgRegCallError(MF); 3924 3925 assert(Mask && "Missing call preserved mask for calling convention"); 3926 Ops.push_back(DAG.getRegisterMask(Mask)); 3927 3928 if (InFlag.getNode()) 3929 Ops.push_back(InFlag); 3930 3931 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 3932 3933 // If we're doing a tail call, use a TC_RETURN here rather than an 3934 // actual call instruction. 3935 if (IsTailCall) { 3936 MF.getFrameInfo().setHasTailCall(); 3937 return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); 3938 } 3939 3940 // Returns a chain and a flag for retval copy to use.
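// The operand list built above (chain, callee, FPDiff for tail calls, the
// argument registers, the register mask, and any incoming glue) is shared by
// the TC_RETURN node above and the CALL node below.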
3941 Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); 3942 InFlag = Chain.getValue(1); 3943 3944 uint64_t CalleePopBytes = 3945 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; 3946 3947 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), 3948 DAG.getIntPtrConstant(CalleePopBytes, DL, true), 3949 InFlag, DL); 3950 if (!Ins.empty()) 3951 InFlag = Chain.getValue(1); 3952 3953 // Handle result values, copying them out of physregs into vregs that we 3954 // return. 3955 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, 3956 InVals, IsThisReturn, 3957 IsThisReturn ? OutVals[0] : SDValue()); 3958 } 3959 3960 bool AArch64TargetLowering::CanLowerReturn( 3961 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, 3962 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { 3963 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS 3964 ? RetCC_AArch64_WebKit_JS 3965 : RetCC_AArch64_AAPCS; 3966 SmallVector<CCValAssign, 16> RVLocs; 3967 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 3968 return CCInfo.CheckReturn(Outs, RetCC); 3969 } 3970 3971 SDValue 3972 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 3973 bool isVarArg, 3974 const SmallVectorImpl<ISD::OutputArg> &Outs, 3975 const SmallVectorImpl<SDValue> &OutVals, 3976 const SDLoc &DL, SelectionDAG &DAG) const { 3977 auto &MF = DAG.getMachineFunction(); 3978 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 3979 3980 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS 3981 ? RetCC_AArch64_WebKit_JS 3982 : RetCC_AArch64_AAPCS; 3983 SmallVector<CCValAssign, 16> RVLocs; 3984 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 3985 *DAG.getContext()); 3986 CCInfo.AnalyzeReturn(Outs, RetCC); 3987 3988 // Copy the result values into the output registers. 3989 SDValue Flag; 3990 SmallVector<SDValue, 4> RetOps(1, Chain); 3991 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); 3992 ++i, ++realRVLocIdx) { 3993 CCValAssign &VA = RVLocs[i]; 3994 assert(VA.isRegLoc() && "Can only return in registers!"); 3995 SDValue Arg = OutVals[realRVLocIdx]; 3996 3997 switch (VA.getLocInfo()) { 3998 default: 3999 llvm_unreachable("Unknown loc info!"); 4000 case CCValAssign::Full: 4001 if (Outs[i].ArgVT == MVT::i1) { 4002 // AAPCS requires i1 to be zero-extended to i8 by the producer of the 4003 // value. This is strictly redundant on Darwin (which uses "zeroext 4004 // i1"), but will be optimised out before ISel. 4005 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); 4006 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 4007 } 4008 break; 4009 case CCValAssign::BCvt: 4010 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 4011 break; 4012 } 4013 4014 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); 4015 Flag = Chain.getValue(1); 4016 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 4017 } 4018 4019 // Windows AArch64 ABIs require that for returning structs by value we copy 4020 // the sret argument into X0 for the return. 4021 // We saved the argument into a virtual register in the entry block, 4022 // so now we copy the value out and into X0. 
4023 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { 4024 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg, 4025 getPointerTy(MF.getDataLayout())); 4026 4027 unsigned RetValReg = AArch64::X0; 4028 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag); 4029 Flag = Chain.getValue(1); 4030 4031 RetOps.push_back( 4032 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); 4033 } 4034 4035 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 4036 const MCPhysReg *I = 4037 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 4038 if (I) { 4039 for (; *I; ++I) { 4040 if (AArch64::GPR64RegClass.contains(*I)) 4041 RetOps.push_back(DAG.getRegister(*I, MVT::i64)); 4042 else if (AArch64::FPR64RegClass.contains(*I)) 4043 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 4044 else 4045 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 4046 } 4047 } 4048 4049 RetOps[0] = Chain; // Update chain. 4050 4051 // Add the flag if we have it. 4052 if (Flag.getNode()) 4053 RetOps.push_back(Flag); 4054 4055 return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps); 4056 } 4057 4058 //===----------------------------------------------------------------------===// 4059 // Other Lowering Code 4060 //===----------------------------------------------------------------------===// 4061 4062 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty, 4063 SelectionDAG &DAG, 4064 unsigned Flag) const { 4065 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 4066 N->getOffset(), Flag); 4067 } 4068 4069 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, 4070 SelectionDAG &DAG, 4071 unsigned Flag) const { 4072 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag); 4073 } 4074 4075 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, 4076 SelectionDAG &DAG, 4077 unsigned Flag) const { 4078 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(), 4079 N->getOffset(), Flag); 4080 } 4081 4082 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty, 4083 SelectionDAG &DAG, 4084 unsigned Flag) const { 4085 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag); 4086 } 4087 4088 // (loadGOT sym) 4089 template <class NodeTy> 4090 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG, 4091 unsigned Flags) const { 4092 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n"); 4093 SDLoc DL(N); 4094 EVT Ty = getPointerTy(DAG.getDataLayout()); 4095 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags); 4096 // FIXME: Once remat is capable of dealing with instructions with register 4097 // operands, expand this into two nodes instead of using a wrapper node. 
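// As an illustration only (the exact expansion is chosen later), on ELF
// targets with the small code model a LOADgot of "sym" typically becomes:
//   adrp x0, :got:sym
//   ldr  x0, [x0, :got_lo12:sym]
// with the analogous @GOTPAGE/@GOTPAGEOFF sequence on MachO.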
4098 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); 4099 } 4100 4101 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)) 4102 template <class NodeTy> 4103 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG, 4104 unsigned Flags) const { 4105 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n"); 4106 SDLoc DL(N); 4107 EVT Ty = getPointerTy(DAG.getDataLayout()); 4108 const unsigned char MO_NC = AArch64II::MO_NC; 4109 return DAG.getNode( 4110 AArch64ISD::WrapperLarge, DL, Ty, 4111 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags), 4112 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags), 4113 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags), 4114 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags)); 4115 } 4116 4117 // (addlow (adrp %hi(sym)) %lo(sym)) 4118 template <class NodeTy> 4119 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, 4120 unsigned Flags) const { 4121 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n"); 4122 SDLoc DL(N); 4123 EVT Ty = getPointerTy(DAG.getDataLayout()); 4124 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags); 4125 SDValue Lo = getTargetNode(N, Ty, DAG, 4126 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags); 4127 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi); 4128 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo); 4129 } 4130 4131 // (adr sym) 4132 template <class NodeTy> 4133 SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG, 4134 unsigned Flags) const { 4135 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n"); 4136 SDLoc DL(N); 4137 EVT Ty = getPointerTy(DAG.getDataLayout()); 4138 SDValue Sym = getTargetNode(N, Ty, DAG, Flags); 4139 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym); 4140 } 4141 4142 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, 4143 SelectionDAG &DAG) const { 4144 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); 4145 const GlobalValue *GV = GN->getGlobal(); 4146 unsigned char OpFlags = 4147 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 4148 4149 if (OpFlags != AArch64II::MO_NO_FLAG) 4150 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && 4151 "unexpected offset in global node"); 4152 4153 // This also catches the large code model case for Darwin, and tiny code 4154 // model with got relocations. 4155 if ((OpFlags & AArch64II::MO_GOT) != 0) { 4156 return getGOT(GN, DAG, OpFlags); 4157 } 4158 4159 SDValue Result; 4160 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 4161 Result = getAddrLarge(GN, DAG, OpFlags); 4162 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { 4163 Result = getAddrTiny(GN, DAG, OpFlags); 4164 } else { 4165 Result = getAddr(GN, DAG, OpFlags); 4166 } 4167 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4168 SDLoc DL(GN); 4169 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB)) 4170 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 4171 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 4172 return Result; 4173 } 4174 4175 /// Convert a TLS address reference into the correct sequence of loads 4176 /// and calls to compute the variable's address (for Darwin, currently) and 4177 /// return an SDValue containing the final node. 4178 4179 /// Darwin only has one TLS scheme which must be capable of dealing with the 4180 /// fully general situation, in the worst case. This means: 4181 /// + "extern __thread" declaration. 
4182 /// + Defined in a possibly unknown dynamic library. 4183 /// 4184 /// The general system is that each __thread variable has a [3 x i64] descriptor 4185 /// which contains information used by the runtime to calculate the address. The 4186 /// only part of this the compiler needs to know about is the first xword, which 4187 /// contains a function pointer that must be called with the address of the 4188 /// entire descriptor in "x0". 4189 /// 4190 /// Since this descriptor may be in a different unit, in general even the 4191 /// descriptor must be accessed via an indirect load. The "ideal" code sequence 4192 /// is: 4193 /// adrp x0, _var@TLVPPAGE 4194 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor 4195 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, 4196 /// ; the function pointer 4197 /// blr x1 ; Uses descriptor address in x0 4198 /// ; Address of _var is now in x0. 4199 /// 4200 /// If the address of _var's descriptor *is* known to the linker, then it can 4201 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for 4202 /// a slight efficiency gain. 4203 SDValue 4204 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, 4205 SelectionDAG &DAG) const { 4206 assert(Subtarget->isTargetDarwin() && 4207 "This function expects a Darwin target"); 4208 4209 SDLoc DL(Op); 4210 MVT PtrVT = getPointerTy(DAG.getDataLayout()); 4211 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 4212 4213 SDValue TLVPAddr = 4214 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 4215 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); 4216 4217 // The first entry in the descriptor is a function pointer that we must call 4218 // to obtain the address of the variable. 4219 SDValue Chain = DAG.getEntryNode(); 4220 SDValue FuncTLVGet = DAG.getLoad( 4221 MVT::i64, DL, Chain, DescAddr, 4222 MachinePointerInfo::getGOT(DAG.getMachineFunction()), 4223 /* Alignment = */ 8, 4224 MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant | 4225 MachineMemOperand::MODereferenceable); 4226 Chain = FuncTLVGet.getValue(1); 4227 4228 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 4229 MFI.setAdjustsStack(true); 4230 4231 // TLS calls preserve all registers except those that absolutely must be 4232 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be 4233 // silly). 4234 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 4235 const uint32_t *Mask = TRI->getTLSCallPreservedMask(); 4236 if (Subtarget->hasCustomCallingConv()) 4237 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); 4238 4239 // Finally, we can make the call. This is just a degenerate version of a 4240 // normal AArch64 call node: x0 takes the address of the descriptor, and 4241 // returns the address of the variable in this thread. 4242 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); 4243 Chain = 4244 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 4245 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), 4246 DAG.getRegisterMask(Mask), Chain.getValue(1)); 4247 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); 4248 } 4249 4250 /// When accessing thread-local variables under either the general-dynamic or 4251 /// local-dynamic system, we make a "TLS-descriptor" call. 
The variable will 4252 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry 4253 /// is a function pointer to carry out the resolution. 4254 /// 4255 /// The sequence is: 4256 /// adrp x0, :tlsdesc:var 4257 /// ldr x1, [x0, #:tlsdesc_lo12:var] 4258 /// add x0, x0, #:tlsdesc_lo12:var 4259 /// .tlsdesccall var 4260 /// blr x1 4261 /// (TPIDR_EL0 offset now in x0) 4262 /// 4263 /// The above sequence must be produced unscheduled, to enable the linker to 4264 /// optimize/relax this sequence. 4265 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the 4266 /// above sequence, and expanded really late in the compilation flow, to ensure 4267 /// the sequence is produced as per above. 4268 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, 4269 const SDLoc &DL, 4270 SelectionDAG &DAG) const { 4271 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4272 4273 SDValue Chain = DAG.getEntryNode(); 4274 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 4275 4276 Chain = 4277 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr}); 4278 SDValue Glue = Chain.getValue(1); 4279 4280 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); 4281 } 4282 4283 SDValue 4284 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, 4285 SelectionDAG &DAG) const { 4286 assert(Subtarget->isTargetELF() && "This function expects an ELF target"); 4287 if (getTargetMachine().getCodeModel() == CodeModel::Large) 4288 report_fatal_error("ELF TLS only supported in small memory model"); 4289 // Different choices can be made for the maximum size of the TLS area for a 4290 // module. For the small address model, the default TLS size is 16MiB and the 4291 // maximum TLS size is 4GiB. 4292 // FIXME: add -mtls-size command line option and make it control the 16MiB 4293 // vs. 4GiB code sequence generation. 4294 // FIXME: add tiny codemodel support. We currently generate the same code as 4295 // small, which may be larger than needed. 4296 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 4297 4298 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); 4299 4300 if (!EnableAArch64ELFLocalDynamicTLSGeneration) { 4301 if (Model == TLSModel::LocalDynamic) 4302 Model = TLSModel::GeneralDynamic; 4303 } 4304 4305 SDValue TPOff; 4306 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4307 SDLoc DL(Op); 4308 const GlobalValue *GV = GA->getGlobal(); 4309 4310 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); 4311 4312 if (Model == TLSModel::LocalExec) { 4313 SDValue HiVar = DAG.getTargetGlobalAddress( 4314 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 4315 SDValue LoVar = DAG.getTargetGlobalAddress( 4316 GV, DL, PtrVT, 0, 4317 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4318 4319 SDValue TPWithOff_lo = 4320 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, 4321 HiVar, 4322 DAG.getTargetConstant(0, DL, MVT::i32)), 4323 0); 4324 SDValue TPWithOff = 4325 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo, 4326 LoVar, 4327 DAG.getTargetConstant(0, DL, MVT::i32)), 4328 0); 4329 return TPWithOff; 4330 } else if (Model == TLSModel::InitialExec) { 4331 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 4332 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); 4333 } else if (Model == TLSModel::LocalDynamic) { 4334 // Local-dynamic accesses proceed in two phases. 
A general-dynamic TLS 4335 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate 4336 // the beginning of the module's TLS region, followed by a DTPREL offset 4337 // calculation. 4338 4339 // These accesses will need deduplicating if there's more than one. 4340 AArch64FunctionInfo *MFI = 4341 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 4342 MFI->incNumLocalDynamicTLSAccesses(); 4343 4344 // The call needs a relocation too for linker relaxation. It doesn't make 4345 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of 4346 // the address. 4347 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, 4348 AArch64II::MO_TLS); 4349 4350 // Now we can calculate the offset from TPIDR_EL0 to this module's 4351 // thread-local area. 4352 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); 4353 4354 // Now use :dtprel_whatever: operations to calculate this variable's offset 4355 // in its thread-storage area. 4356 SDValue HiVar = DAG.getTargetGlobalAddress( 4357 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 4358 SDValue LoVar = DAG.getTargetGlobalAddress( 4359 GV, DL, MVT::i64, 0, 4360 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4361 4362 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, 4363 DAG.getTargetConstant(0, DL, MVT::i32)), 4364 0); 4365 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, 4366 DAG.getTargetConstant(0, DL, MVT::i32)), 4367 0); 4368 } else if (Model == TLSModel::GeneralDynamic) { 4369 // The call needs a relocation too for linker relaxation. It doesn't make 4370 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of 4371 // the address. 4372 SDValue SymAddr = 4373 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 4374 4375 // Finally we can make a call to calculate the offset from tpidr_el0. 4376 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); 4377 } else 4378 llvm_unreachable("Unsupported ELF TLS access model"); 4379 4380 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 4381 } 4382 4383 SDValue 4384 AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op, 4385 SelectionDAG &DAG) const { 4386 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); 4387 4388 SDValue Chain = DAG.getEntryNode(); 4389 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 4390 SDLoc DL(Op); 4391 4392 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64); 4393 4394 // Load the ThreadLocalStoragePointer from the TEB 4395 // A pointer to the TLS array is located at offset 0x58 from the TEB. 4396 SDValue TLSArray = 4397 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL)); 4398 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); 4399 Chain = TLSArray.getValue(1); 4400 4401 // Load the TLS index from the C runtime; 4402 // This does the same as getAddr(), but without having a GlobalAddressSDNode. 4403 // This also does the same as LOADgot, but using a generic i32 load, 4404 // while LOADgot only loads i64. 
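// As a rough sketch (the exact relocation spellings depend on the object
// format), the index load built below amounts to:
//   adrp xN, _tls_index
//   add  xN, xN, :lo12:_tls_index
//   ldr  wN, [xN]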
4405 SDValue TLSIndexHi = 4406 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE); 4407 SDValue TLSIndexLo = DAG.getTargetExternalSymbol( 4408 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4409 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi); 4410 SDValue TLSIndex = 4411 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo); 4412 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo()); 4413 Chain = TLSIndex.getValue(1); 4414 4415 // The pointer to the thread's TLS data area is at the TLS Index scaled by 8 4416 // offset into the TLSArray. 4417 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex); 4418 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, 4419 DAG.getConstant(3, DL, PtrVT)); 4420 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, 4421 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), 4422 MachinePointerInfo()); 4423 Chain = TLS.getValue(1); 4424 4425 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 4426 const GlobalValue *GV = GA->getGlobal(); 4427 SDValue TGAHi = DAG.getTargetGlobalAddress( 4428 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); 4429 SDValue TGALo = DAG.getTargetGlobalAddress( 4430 GV, DL, PtrVT, 0, 4431 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 4432 4433 // Add the offset from the start of the .tls section (section base). 4434 SDValue Addr = 4435 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi, 4436 DAG.getTargetConstant(0, DL, MVT::i32)), 4437 0); 4438 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo); 4439 return Addr; 4440 } 4441 4442 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, 4443 SelectionDAG &DAG) const { 4444 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 4445 if (DAG.getTarget().useEmulatedTLS()) 4446 return LowerToTLSEmulatedModel(GA, DAG); 4447 4448 if (Subtarget->isTargetDarwin()) 4449 return LowerDarwinGlobalTLSAddress(Op, DAG); 4450 if (Subtarget->isTargetELF()) 4451 return LowerELFGlobalTLSAddress(Op, DAG); 4452 if (Subtarget->isTargetWindows()) 4453 return LowerWindowsGlobalTLSAddress(Op, DAG); 4454 4455 llvm_unreachable("Unexpected platform trying to use TLS"); 4456 } 4457 4458 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 4459 SDValue Chain = Op.getOperand(0); 4460 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 4461 SDValue LHS = Op.getOperand(2); 4462 SDValue RHS = Op.getOperand(3); 4463 SDValue Dest = Op.getOperand(4); 4464 SDLoc dl(Op); 4465 4466 MachineFunction &MF = DAG.getMachineFunction(); 4467 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions 4468 // will not be produced, as they are conditional branch instructions that do 4469 // not set flags. 4470 bool ProduceNonFlagSettingCondBr = 4471 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); 4472 4473 // Handle f128 first, since lowering it will result in comparing the return 4474 // value of a libcall against zero, which is just what the rest of LowerBR_CC 4475 // is expecting to deal with. 4476 if (LHS.getValueType() == MVT::f128) { 4477 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 4478 4479 // If softenSetCCOperands returned a scalar, we need to compare the result 4480 // against zero to select between true and false values. 
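  // (For example, an f128 SETEQ is softened into a call to a comparison
  // libcall such as __eqtf2, whose integer result encodes the outcome of the
  // comparison.)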
4481 if (!RHS.getNode()) { 4482 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 4483 CC = ISD::SETNE; 4484 } 4485 } 4486 4487 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch 4488 // instruction. 4489 if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && 4490 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 4491 // Only lower legal XALUO ops. 4492 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) 4493 return SDValue(); 4494 4495 // The actual operation with overflow check. 4496 AArch64CC::CondCode OFCC; 4497 SDValue Value, Overflow; 4498 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); 4499 4500 if (CC == ISD::SETNE) 4501 OFCC = getInvertedCondCode(OFCC); 4502 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32); 4503 4504 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 4505 Overflow); 4506 } 4507 4508 if (LHS.getValueType().isInteger()) { 4509 assert((LHS.getValueType() == RHS.getValueType()) && 4510 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); 4511 4512 // If the RHS of the comparison is zero, we can potentially fold this 4513 // to a specialized branch. 4514 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); 4515 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) { 4516 if (CC == ISD::SETEQ) { 4517 // See if we can use a TBZ to fold in an AND as well. 4518 // TBZ has a smaller branch displacement than CBZ. If the offset is 4519 // out of bounds, a late MI-layer pass rewrites branches. 4520 // 403.gcc is an example that hits this case. 4521 if (LHS.getOpcode() == ISD::AND && 4522 isa<ConstantSDNode>(LHS.getOperand(1)) && 4523 isPowerOf2_64(LHS.getConstantOperandVal(1))) { 4524 SDValue Test = LHS.getOperand(0); 4525 uint64_t Mask = LHS.getConstantOperandVal(1); 4526 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, 4527 DAG.getConstant(Log2_64(Mask), dl, MVT::i64), 4528 Dest); 4529 } 4530 4531 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); 4532 } else if (CC == ISD::SETNE) { 4533 // See if we can use a TBZ to fold in an AND as well. 4534 // TBZ has a smaller branch displacement than CBZ. If the offset is 4535 // out of bounds, a late MI-layer pass rewrites branches. 4536 // 403.gcc is an example that hits this case. 4537 if (LHS.getOpcode() == ISD::AND && 4538 isa<ConstantSDNode>(LHS.getOperand(1)) && 4539 isPowerOf2_64(LHS.getConstantOperandVal(1))) { 4540 SDValue Test = LHS.getOperand(0); 4541 uint64_t Mask = LHS.getConstantOperandVal(1); 4542 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, 4543 DAG.getConstant(Log2_64(Mask), dl, MVT::i64), 4544 Dest); 4545 } 4546 4547 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); 4548 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { 4549 // Don't combine AND since emitComparison converts the AND to an ANDS 4550 // (a.k.a. TST) and the test in the test bit and branch instruction 4551 // becomes redundant. This would also increase register pressure. 4552 uint64_t Mask = LHS.getValueSizeInBits() - 1; 4553 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, 4554 DAG.getConstant(Mask, dl, MVT::i64), Dest); 4555 } 4556 } 4557 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && 4558 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) { 4559 // Don't combine AND since emitComparison converts the AND to an ANDS 4560 // (a.k.a. 
TST) and the test in the test bit and branch instruction 4561 // becomes redundant. This would also increase register pressure. 4562 uint64_t Mask = LHS.getValueSizeInBits() - 1; 4563 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, 4564 DAG.getConstant(Mask, dl, MVT::i64), Dest); 4565 } 4566 4567 SDValue CCVal; 4568 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 4569 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, 4570 Cmp); 4571 } 4572 4573 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || 4574 LHS.getValueType() == MVT::f64); 4575 4576 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 4577 // clean. Some of them require two branches to implement. 4578 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 4579 AArch64CC::CondCode CC1, CC2; 4580 changeFPCCToAArch64CC(CC, CC1, CC2); 4581 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 4582 SDValue BR1 = 4583 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); 4584 if (CC2 != AArch64CC::AL) { 4585 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 4586 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, 4587 Cmp); 4588 } 4589 4590 return BR1; 4591 } 4592 4593 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, 4594 SelectionDAG &DAG) const { 4595 EVT VT = Op.getValueType(); 4596 SDLoc DL(Op); 4597 4598 SDValue In1 = Op.getOperand(0); 4599 SDValue In2 = Op.getOperand(1); 4600 EVT SrcVT = In2.getValueType(); 4601 4602 if (SrcVT.bitsLT(VT)) 4603 In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); 4604 else if (SrcVT.bitsGT(VT)) 4605 In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); 4606 4607 EVT VecVT; 4608 uint64_t EltMask; 4609 SDValue VecVal1, VecVal2; 4610 4611 auto setVecVal = [&] (int Idx) { 4612 if (!VT.isVector()) { 4613 VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, 4614 DAG.getUNDEF(VecVT), In1); 4615 VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, 4616 DAG.getUNDEF(VecVT), In2); 4617 } else { 4618 VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); 4619 VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); 4620 } 4621 }; 4622 4623 if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { 4624 VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); 4625 EltMask = 0x80000000ULL; 4626 setVecVal(AArch64::ssub); 4627 } else if (VT == MVT::f64 || VT == MVT::v2f64) { 4628 VecVT = MVT::v2i64; 4629 4630 // We want to materialize a mask with the high bit set, but the AdvSIMD 4631 // immediate moves cannot materialize that in a single instruction for 4632 // 64-bit elements. Instead, materialize zero and then negate it. 4633 EltMask = 0; 4634 4635 setVecVal(AArch64::dsub); 4636 } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) { 4637 VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16); 4638 EltMask = 0x8000ULL; 4639 setVecVal(AArch64::hsub); 4640 } else { 4641 llvm_unreachable("Invalid type for copysign!"); 4642 } 4643 4644 SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT); 4645 4646 // If we couldn't materialize the mask above, then the mask vector will be 4647 // the zero vector, and we need to negate it here. 
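  // (Negating +0.0 yields -0.0, whose only set bit is the sign bit, i.e.
  // exactly the 0x8000000000000000 per-lane mask we could not encode as an
  // AdvSIMD immediate.)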
4648 if (VT == MVT::f64 || VT == MVT::v2f64) { 4649 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); 4650 BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); 4651 BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); 4652 } 4653 4654 SDValue Sel = 4655 DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); 4656 4657 if (VT == MVT::f16) 4658 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel); 4659 if (VT == MVT::f32) 4660 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); 4661 else if (VT == MVT::f64) 4662 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); 4663 else 4664 return DAG.getNode(ISD::BITCAST, DL, VT, Sel); 4665 } 4666 4667 SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { 4668 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 4669 Attribute::NoImplicitFloat)) 4670 return SDValue(); 4671 4672 if (!Subtarget->hasNEON()) 4673 return SDValue(); 4674 4675 // While there is no integer popcount instruction, it can 4676 // be more efficiently lowered to the following sequence that uses 4677 // AdvSIMD registers/instructions as long as the copies to/from 4678 // the AdvSIMD registers are cheap. 4679 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd 4680 // CNT V0.8B, V0.8B // 8xbyte pop-counts 4681 // ADDV B0, V0.8B // sum 8xbyte pop-counts 4682 // UMOV X0, V0.B[0] // copy byte result back to integer reg 4683 SDValue Val = Op.getOperand(0); 4684 SDLoc DL(Op); 4685 EVT VT = Op.getValueType(); 4686 4687 if (VT == MVT::i32 || VT == MVT::i64) { 4688 if (VT == MVT::i32) 4689 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); 4690 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); 4691 4692 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); 4693 SDValue UaddLV = DAG.getNode( 4694 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, 4695 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); 4696 4697 if (VT == MVT::i64) 4698 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); 4699 return UaddLV; 4700 } 4701 4702 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || 4703 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && 4704 "Unexpected type for custom ctpop lowering"); 4705 4706 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 4707 Val = DAG.getBitcast(VT8Bit, Val); 4708 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val); 4709 4710 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. 4711 unsigned EltSize = 8; 4712 unsigned NumElts = VT.is64BitVector() ? 8 : 16; 4713 while (EltSize != VT.getScalarSizeInBits()) { 4714 EltSize *= 2; 4715 NumElts /= 2; 4716 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); 4717 Val = DAG.getNode( 4718 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, 4719 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val); 4720 } 4721 4722 return Val; 4723 } 4724 4725 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 4726 4727 if (Op.getValueType().isVector()) 4728 return LowerVSETCC(Op, DAG); 4729 4730 SDValue LHS = Op.getOperand(0); 4731 SDValue RHS = Op.getOperand(1); 4732 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 4733 SDLoc dl(Op); 4734 4735 // We chose ZeroOrOneBooleanContents, so use zero and one. 
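  // For example, an i32 (setcc ne x, 0) typically ends up as something like
  // "cmp w8, #0; cset w0, ne", where CSET is an alias of CSINC on wzr.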
4736 EVT VT = Op.getValueType(); 4737 SDValue TVal = DAG.getConstant(1, dl, VT); 4738 SDValue FVal = DAG.getConstant(0, dl, VT); 4739 4740 // Handle f128 first, since one possible outcome is a normal integer 4741 // comparison which gets picked up by the next if statement. 4742 if (LHS.getValueType() == MVT::f128) { 4743 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 4744 4745 // If softenSetCCOperands returned a scalar, use it. 4746 if (!RHS.getNode()) { 4747 assert(LHS.getValueType() == Op.getValueType() && 4748 "Unexpected setcc expansion!"); 4749 return LHS; 4750 } 4751 } 4752 4753 if (LHS.getValueType().isInteger()) { 4754 SDValue CCVal; 4755 SDValue Cmp = 4756 getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); 4757 4758 // Note that we inverted the condition above, so we reverse the order of 4759 // the true and false operands here. This will allow the setcc to be 4760 // matched to a single CSINC instruction. 4761 return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); 4762 } 4763 4764 // Now we know we're dealing with FP values. 4765 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || 4766 LHS.getValueType() == MVT::f64); 4767 4768 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead 4769 // and do the comparison. 4770 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 4771 4772 AArch64CC::CondCode CC1, CC2; 4773 changeFPCCToAArch64CC(CC, CC1, CC2); 4774 if (CC2 == AArch64CC::AL) { 4775 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); 4776 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 4777 4778 // Note that we inverted the condition above, so we reverse the order of 4779 // the true and false operands here. This will allow the setcc to be 4780 // matched to a single CSINC instruction. 4781 return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); 4782 } else { 4783 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't 4784 // totally clean. Some of them require two CSELs to implement. As is in 4785 // this case, we emit the first CSEL and then emit a second using the output 4786 // of the first as the RHS. We're effectively OR'ing the two CC's together. 4787 4788 // FIXME: It would be nice if we could match the two CSELs to two CSINCs. 4789 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 4790 SDValue CS1 = 4791 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); 4792 4793 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 4794 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); 4795 } 4796 } 4797 4798 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, 4799 SDValue RHS, SDValue TVal, 4800 SDValue FVal, const SDLoc &dl, 4801 SelectionDAG &DAG) const { 4802 // Handle f128 first, because it will result in a comparison of some RTLIB 4803 // call result against zero. 4804 if (LHS.getValueType() == MVT::f128) { 4805 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 4806 4807 // If softenSetCCOperands returned a scalar, we need to compare the result 4808 // against zero to select between true and false values. 4809 if (!RHS.getNode()) { 4810 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 4811 CC = ISD::SETNE; 4812 } 4813 } 4814 4815 // Also handle f16, for which we need to do a f32 comparison. 
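  // (Only needed when the subtarget lacks full FP16 support; with +fullfp16
  // the comparison is done directly on the half-precision values.)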
4816 if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { 4817 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); 4818 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); 4819 } 4820 4821 // Next, handle integers. 4822 if (LHS.getValueType().isInteger()) { 4823 assert((LHS.getValueType() == RHS.getValueType()) && 4824 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); 4825 4826 unsigned Opcode = AArch64ISD::CSEL; 4827 4828 // If both the TVal and the FVal are constants, see if we can swap them in 4829 // order to for a CSINV or CSINC out of them. 4830 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); 4831 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); 4832 4833 if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { 4834 std::swap(TVal, FVal); 4835 std::swap(CTVal, CFVal); 4836 CC = ISD::getSetCCInverse(CC, true); 4837 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { 4838 std::swap(TVal, FVal); 4839 std::swap(CTVal, CFVal); 4840 CC = ISD::getSetCCInverse(CC, true); 4841 } else if (TVal.getOpcode() == ISD::XOR) { 4842 // If TVal is a NOT we want to swap TVal and FVal so that we can match 4843 // with a CSINV rather than a CSEL. 4844 if (isAllOnesConstant(TVal.getOperand(1))) { 4845 std::swap(TVal, FVal); 4846 std::swap(CTVal, CFVal); 4847 CC = ISD::getSetCCInverse(CC, true); 4848 } 4849 } else if (TVal.getOpcode() == ISD::SUB) { 4850 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so 4851 // that we can match with a CSNEG rather than a CSEL. 4852 if (isNullConstant(TVal.getOperand(0))) { 4853 std::swap(TVal, FVal); 4854 std::swap(CTVal, CFVal); 4855 CC = ISD::getSetCCInverse(CC, true); 4856 } 4857 } else if (CTVal && CFVal) { 4858 const int64_t TrueVal = CTVal->getSExtValue(); 4859 const int64_t FalseVal = CFVal->getSExtValue(); 4860 bool Swap = false; 4861 4862 // If both TVal and FVal are constants, see if FVal is the 4863 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC 4864 // instead of a CSEL in that case. 4865 if (TrueVal == ~FalseVal) { 4866 Opcode = AArch64ISD::CSINV; 4867 } else if (TrueVal == -FalseVal) { 4868 Opcode = AArch64ISD::CSNEG; 4869 } else if (TVal.getValueType() == MVT::i32) { 4870 // If our operands are only 32-bit wide, make sure we use 32-bit 4871 // arithmetic for the check whether we can use CSINC. This ensures that 4872 // the addition in the check will wrap around properly in case there is 4873 // an overflow (which would not be the case if we do the check with 4874 // 64-bit arithmetic). 4875 const uint32_t TrueVal32 = CTVal->getZExtValue(); 4876 const uint32_t FalseVal32 = CFVal->getZExtValue(); 4877 4878 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { 4879 Opcode = AArch64ISD::CSINC; 4880 4881 if (TrueVal32 > FalseVal32) { 4882 Swap = true; 4883 } 4884 } 4885 // 64-bit check whether we can use CSINC. 4886 } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { 4887 Opcode = AArch64ISD::CSINC; 4888 4889 if (TrueVal > FalseVal) { 4890 Swap = true; 4891 } 4892 } 4893 4894 // Swap TVal and FVal if necessary. 4895 if (Swap) { 4896 std::swap(TVal, FVal); 4897 std::swap(CTVal, CFVal); 4898 CC = ISD::getSetCCInverse(CC, true); 4899 } 4900 4901 if (Opcode != AArch64ISD::CSEL) { 4902 // Drop FVal since we can get its value by simply inverting/negating 4903 // TVal. 4904 FVal = TVal; 4905 } 4906 } 4907 4908 // Avoid materializing a constant when possible by reusing a known value in 4909 // a register. 
However, don't perform this optimization if the known value 4910 // is one, zero or negative one in the case of a CSEL. We can always 4911 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the 4912 // FVal, respectively. 4913 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS); 4914 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() && 4915 !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) { 4916 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); 4917 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to 4918 // "a != C ? x : a" to avoid materializing C. 4919 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ) 4920 TVal = LHS; 4921 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE) 4922 FVal = LHS; 4923 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) { 4924 assert (CTVal && CFVal && "Expected constant operands for CSNEG."); 4925 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to 4926 // avoid materializing C. 4927 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); 4928 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) { 4929 Opcode = AArch64ISD::CSINV; 4930 TVal = LHS; 4931 FVal = DAG.getConstant(0, dl, FVal.getValueType()); 4932 } 4933 } 4934 4935 SDValue CCVal; 4936 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 4937 EVT VT = TVal.getValueType(); 4938 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); 4939 } 4940 4941 // Now we know we're dealing with FP values. 4942 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || 4943 LHS.getValueType() == MVT::f64); 4944 assert(LHS.getValueType() == RHS.getValueType()); 4945 EVT VT = TVal.getValueType(); 4946 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 4947 4948 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally 4949 // clean. Some of them require two CSELs to implement. 4950 AArch64CC::CondCode CC1, CC2; 4951 changeFPCCToAArch64CC(CC, CC1, CC2); 4952 4953 if (DAG.getTarget().Options.UnsafeFPMath) { 4954 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and 4955 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0. 4956 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS); 4957 if (RHSVal && RHSVal->isZero()) { 4958 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal); 4959 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal); 4960 4961 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) && 4962 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType()) 4963 TVal = LHS; 4964 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) && 4965 CFVal && CFVal->isZero() && 4966 FVal.getValueType() == LHS.getValueType()) 4967 FVal = LHS; 4968 } 4969 } 4970 4971 // Emit first, and possibly only, CSEL. 4972 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); 4973 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); 4974 4975 // If we need a second CSEL, emit it, using the output of the first as the 4976 // RHS. We're effectively OR'ing the two CC's together. 4977 if (CC2 != AArch64CC::AL) { 4978 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); 4979 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); 4980 } 4981 4982 // Otherwise, return the output of the first CSEL. 
4983 return CS1; 4984 } 4985 4986 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, 4987 SelectionDAG &DAG) const { 4988 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 4989 SDValue LHS = Op.getOperand(0); 4990 SDValue RHS = Op.getOperand(1); 4991 SDValue TVal = Op.getOperand(2); 4992 SDValue FVal = Op.getOperand(3); 4993 SDLoc DL(Op); 4994 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); 4995 } 4996 4997 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, 4998 SelectionDAG &DAG) const { 4999 SDValue CCVal = Op->getOperand(0); 5000 SDValue TVal = Op->getOperand(1); 5001 SDValue FVal = Op->getOperand(2); 5002 SDLoc DL(Op); 5003 5004 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select 5005 // instruction. 5006 if (isOverflowIntrOpRes(CCVal)) { 5007 // Only lower legal XALUO ops. 5008 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) 5009 return SDValue(); 5010 5011 AArch64CC::CondCode OFCC; 5012 SDValue Value, Overflow; 5013 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); 5014 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); 5015 5016 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, 5017 CCVal, Overflow); 5018 } 5019 5020 // Lower it the same way as we would lower a SELECT_CC node. 5021 ISD::CondCode CC; 5022 SDValue LHS, RHS; 5023 if (CCVal.getOpcode() == ISD::SETCC) { 5024 LHS = CCVal.getOperand(0); 5025 RHS = CCVal.getOperand(1); 5026 CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get(); 5027 } else { 5028 LHS = CCVal; 5029 RHS = DAG.getConstant(0, DL, CCVal.getValueType()); 5030 CC = ISD::SETNE; 5031 } 5032 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); 5033 } 5034 5035 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, 5036 SelectionDAG &DAG) const { 5037 // Jump table entries as PC relative offsets. No additional tweaking 5038 // is necessary here. Just get the address of the jump table. 5039 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5040 5041 if (getTargetMachine().getCodeModel() == CodeModel::Large && 5042 !Subtarget->isTargetMachO()) { 5043 return getAddrLarge(JT, DAG); 5044 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { 5045 return getAddrTiny(JT, DAG); 5046 } 5047 return getAddr(JT, DAG); 5048 } 5049 5050 SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op, 5051 SelectionDAG &DAG) const { 5052 // Jump table entries as PC relative offsets. No additional tweaking 5053 // is necessary here. Just get the address of the jump table. 5054 SDLoc DL(Op); 5055 SDValue JT = Op.getOperand(1); 5056 SDValue Entry = Op.getOperand(2); 5057 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex(); 5058 5059 SDNode *Dest = 5060 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT, 5061 Entry, DAG.getTargetJumpTable(JTI, MVT::i32)); 5062 return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0), 5063 SDValue(Dest, 0)); 5064 } 5065 5066 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, 5067 SelectionDAG &DAG) const { 5068 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 5069 5070 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 5071 // Use the GOT for the large code model on iOS. 
5072 if (Subtarget->isTargetMachO()) { 5073 return getGOT(CP, DAG); 5074 } 5075 return getAddrLarge(CP, DAG); 5076 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { 5077 return getAddrTiny(CP, DAG); 5078 } else { 5079 return getAddr(CP, DAG); 5080 } 5081 } 5082 5083 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, 5084 SelectionDAG &DAG) const { 5085 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op); 5086 if (getTargetMachine().getCodeModel() == CodeModel::Large && 5087 !Subtarget->isTargetMachO()) { 5088 return getAddrLarge(BA, DAG); 5089 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { 5090 return getAddrTiny(BA, DAG); 5091 } 5092 return getAddr(BA, DAG); 5093 } 5094 5095 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, 5096 SelectionDAG &DAG) const { 5097 AArch64FunctionInfo *FuncInfo = 5098 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 5099 5100 SDLoc DL(Op); 5101 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), 5102 getPointerTy(DAG.getDataLayout())); 5103 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 5104 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 5105 MachinePointerInfo(SV)); 5106 } 5107 5108 SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, 5109 SelectionDAG &DAG) const { 5110 AArch64FunctionInfo *FuncInfo = 5111 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 5112 5113 SDLoc DL(Op); 5114 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0 5115 ? FuncInfo->getVarArgsGPRIndex() 5116 : FuncInfo->getVarArgsStackIndex(), 5117 getPointerTy(DAG.getDataLayout())); 5118 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 5119 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 5120 MachinePointerInfo(SV)); 5121 } 5122 5123 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, 5124 SelectionDAG &DAG) const { 5125 // The layout of the va_list struct is specified in the AArch64 Procedure Call 5126 // Standard, section B.3. 
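  // For reference, the structure initialised below looks roughly like this
  // in C:
  //   typedef struct {
  //     void *__stack;   // offset 0:  next stacked argument
  //     void *__gr_top;  // offset 8:  end of the GP register save area
  //     void *__vr_top;  // offset 16: end of the FP/SIMD register save area
  //     int   __gr_offs; // offset 24: negative offset from __gr_top
  //     int   __vr_offs; // offset 28: negative offset from __vr_top
  //   } va_list;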
5127 MachineFunction &MF = DAG.getMachineFunction(); 5128 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 5129 auto PtrVT = getPointerTy(DAG.getDataLayout()); 5130 SDLoc DL(Op); 5131 5132 SDValue Chain = Op.getOperand(0); 5133 SDValue VAList = Op.getOperand(1); 5134 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 5135 SmallVector<SDValue, 4> MemOps; 5136 5137 // void *__stack at offset 0 5138 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); 5139 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, 5140 MachinePointerInfo(SV), /* Alignment = */ 8)); 5141 5142 // void *__gr_top at offset 8 5143 int GPRSize = FuncInfo->getVarArgsGPRSize(); 5144 if (GPRSize > 0) { 5145 SDValue GRTop, GRTopAddr; 5146 5147 GRTopAddr = 5148 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT)); 5149 5150 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); 5151 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, 5152 DAG.getConstant(GPRSize, DL, PtrVT)); 5153 5154 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, 5155 MachinePointerInfo(SV, 8), 5156 /* Alignment = */ 8)); 5157 } 5158 5159 // void *__vr_top at offset 16 5160 int FPRSize = FuncInfo->getVarArgsFPRSize(); 5161 if (FPRSize > 0) { 5162 SDValue VRTop, VRTopAddr; 5163 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 5164 DAG.getConstant(16, DL, PtrVT)); 5165 5166 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); 5167 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, 5168 DAG.getConstant(FPRSize, DL, PtrVT)); 5169 5170 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, 5171 MachinePointerInfo(SV, 16), 5172 /* Alignment = */ 8)); 5173 } 5174 5175 // int __gr_offs at offset 24 5176 SDValue GROffsAddr = 5177 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); 5178 MemOps.push_back(DAG.getStore( 5179 Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr, 5180 MachinePointerInfo(SV, 24), /* Alignment = */ 4)); 5181 5182 // int __vr_offs at offset 28 5183 SDValue VROffsAddr = 5184 DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); 5185 MemOps.push_back(DAG.getStore( 5186 Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr, 5187 MachinePointerInfo(SV, 28), /* Alignment = */ 4)); 5188 5189 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 5190 } 5191 5192 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, 5193 SelectionDAG &DAG) const { 5194 MachineFunction &MF = DAG.getMachineFunction(); 5195 5196 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv())) 5197 return LowerWin64_VASTART(Op, DAG); 5198 else if (Subtarget->isTargetDarwin()) 5199 return LowerDarwin_VASTART(Op, DAG); 5200 else 5201 return LowerAAPCS_VASTART(Op, DAG); 5202 } 5203 5204 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, 5205 SelectionDAG &DAG) const { 5206 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single 5207 // pointer. 5208 SDLoc DL(Op); 5209 unsigned VaListSize = 5210 Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 
8 : 32; 5211 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 5212 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 5213 5214 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), 5215 Op.getOperand(2), 5216 DAG.getConstant(VaListSize, DL, MVT::i32), 5217 8, false, false, false, MachinePointerInfo(DestSV), 5218 MachinePointerInfo(SrcSV)); 5219 } 5220 5221 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 5222 assert(Subtarget->isTargetDarwin() && 5223 "automatic va_arg instruction only works on Darwin"); 5224 5225 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 5226 EVT VT = Op.getValueType(); 5227 SDLoc DL(Op); 5228 SDValue Chain = Op.getOperand(0); 5229 SDValue Addr = Op.getOperand(1); 5230 unsigned Align = Op.getConstantOperandVal(3); 5231 auto PtrVT = getPointerTy(DAG.getDataLayout()); 5232 5233 SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V)); 5234 Chain = VAList.getValue(1); 5235 5236 if (Align > 8) { 5237 assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); 5238 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 5239 DAG.getConstant(Align - 1, DL, PtrVT)); 5240 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, 5241 DAG.getConstant(-(int64_t)Align, DL, PtrVT)); 5242 } 5243 5244 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); 5245 uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); 5246 5247 // Scalar integer and FP values smaller than 64 bits are implicitly extended 5248 // up to 64 bits. At the very least, we have to increase the striding of the 5249 // vaargs list to match this, and for FP values we need to introduce 5250 // FP_ROUND nodes as well. 5251 if (VT.isInteger() && !VT.isVector()) 5252 ArgSize = 8; 5253 bool NeedFPTrunc = false; 5254 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { 5255 ArgSize = 8; 5256 NeedFPTrunc = true; 5257 } 5258 5259 // Increment the pointer, VAList, to the next vaarg 5260 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, 5261 DAG.getConstant(ArgSize, DL, PtrVT)); 5262 // Store the incremented VAList to the legalized pointer 5263 SDValue APStore = 5264 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); 5265 5266 // Load the actual argument out of the pointer VAList 5267 if (NeedFPTrunc) { 5268 // Load the value as an f64. 5269 SDValue WideFP = 5270 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo()); 5271 // Round the value down to an f32. 5272 SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), 5273 DAG.getIntPtrConstant(1, DL)); 5274 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; 5275 // Merge the rounded value with the chain output of the load. 
5276 return DAG.getMergeValues(Ops, DL); 5277 } 5278 5279 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo()); 5280 } 5281 5282 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, 5283 SelectionDAG &DAG) const { 5284 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 5285 MFI.setFrameAddressIsTaken(true); 5286 5287 EVT VT = Op.getValueType(); 5288 SDLoc DL(Op); 5289 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5290 SDValue FrameAddr = 5291 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); 5292 while (Depth--) 5293 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, 5294 MachinePointerInfo()); 5295 return FrameAddr; 5296 } 5297 5298 SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op, 5299 SelectionDAG &DAG) const { 5300 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 5301 5302 EVT VT = getPointerTy(DAG.getDataLayout()); 5303 SDLoc DL(Op); 5304 int FI = MFI.CreateFixedObject(4, 0, false); 5305 return DAG.getFrameIndex(FI, VT); 5306 } 5307 5308 #define GET_REGISTER_MATCHER 5309 #include "AArch64GenAsmMatcher.inc" 5310 5311 // FIXME? Maybe this could be a TableGen attribute on some registers and 5312 // this table could be generated automatically from RegInfo. 5313 unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, 5314 SelectionDAG &DAG) const { 5315 unsigned Reg = MatchRegisterName(RegName); 5316 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) { 5317 const MCRegisterInfo *MRI = Subtarget->getRegisterInfo(); 5318 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false); 5319 if (!Subtarget->isXRegisterReserved(DwarfRegNum)) 5320 Reg = 0; 5321 } 5322 if (Reg) 5323 return Reg; 5324 report_fatal_error(Twine("Invalid register name \"" 5325 + StringRef(RegName) + "\".")); 5326 } 5327 5328 SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op, 5329 SelectionDAG &DAG) const { 5330 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true); 5331 5332 EVT VT = Op.getValueType(); 5333 SDLoc DL(Op); 5334 5335 SDValue FrameAddr = 5336 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); 5337 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); 5338 5339 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset); 5340 } 5341 5342 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, 5343 SelectionDAG &DAG) const { 5344 MachineFunction &MF = DAG.getMachineFunction(); 5345 MachineFrameInfo &MFI = MF.getFrameInfo(); 5346 MFI.setReturnAddressIsTaken(true); 5347 5348 EVT VT = Op.getValueType(); 5349 SDLoc DL(Op); 5350 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5351 if (Depth) { 5352 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 5353 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); 5354 return DAG.getLoad(VT, DL, DAG.getEntryNode(), 5355 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), 5356 MachinePointerInfo()); 5357 } 5358 5359 // Return LR, which contains the return address. Mark it an implicit live-in. 5360 unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); 5361 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); 5362 } 5363 5364 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 5365 /// i64 values and take a 2 x i64 value to shift plus a shift amount. 
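/// For example, a variable shift of an i128 value is legalized into one of
/// these nodes; the expansion below selects between the "normal" (< 64) and
/// "big" (>= 64) shift-amount cases with CSELs rather than branches.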
5366 SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, 5367 SelectionDAG &DAG) const { 5368 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5369 EVT VT = Op.getValueType(); 5370 unsigned VTBits = VT.getSizeInBits(); 5371 SDLoc dl(Op); 5372 SDValue ShOpLo = Op.getOperand(0); 5373 SDValue ShOpHi = Op.getOperand(1); 5374 SDValue ShAmt = Op.getOperand(2); 5375 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 5376 5377 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 5378 5379 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, 5380 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); 5381 SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 5382 5383 // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which 5384 // is "undef". We wanted 0, so CSEL it directly. 5385 SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), 5386 ISD::SETEQ, dl, DAG); 5387 SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); 5388 HiBitsForLo = 5389 DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), 5390 HiBitsForLo, CCVal, Cmp); 5391 5392 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, 5393 DAG.getConstant(VTBits, dl, MVT::i64)); 5394 5395 SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 5396 SDValue LoForNormalShift = 5397 DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo); 5398 5399 Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, 5400 dl, DAG); 5401 CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); 5402 SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 5403 SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, 5404 LoForNormalShift, CCVal, Cmp); 5405 5406 // AArch64 shifts larger than the register width are wrapped rather than 5407 // clamped, so we can't just emit "hi >> x". 5408 SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 5409 SDValue HiForBigShift = 5410 Opc == ISD::SRA 5411 ? DAG.getNode(Opc, dl, VT, ShOpHi, 5412 DAG.getConstant(VTBits - 1, dl, MVT::i64)) 5413 : DAG.getConstant(0, dl, VT); 5414 SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, 5415 HiForNormalShift, CCVal, Cmp); 5416 5417 SDValue Ops[2] = { Lo, Hi }; 5418 return DAG.getMergeValues(Ops, dl); 5419 } 5420 5421 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 5422 /// i64 values and take a 2 x i64 value to shift plus a shift amount. 5423 SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, 5424 SelectionDAG &DAG) const { 5425 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5426 EVT VT = Op.getValueType(); 5427 unsigned VTBits = VT.getSizeInBits(); 5428 SDLoc dl(Op); 5429 SDValue ShOpLo = Op.getOperand(0); 5430 SDValue ShOpHi = Op.getOperand(1); 5431 SDValue ShAmt = Op.getOperand(2); 5432 5433 assert(Op.getOpcode() == ISD::SHL_PARTS); 5434 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, 5435 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); 5436 SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 5437 5438 // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which 5439 // is "undef". We wanted 0, so CSEL it directly. 
  SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
                               ISD::SETEQ, dl, DAG);
  SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
  LoBitsForHi =
      DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
                  LoBitsForHi, CCVal, Cmp);

  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i64));
  SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
  SDValue HiForNormalShift =
      DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);

  SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);

  Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
                       dl, DAG);
  CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
  SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
                           HiForNormalShift, CCVal, Cmp);

  // AArch64 shifts of larger than register sizes are wrapped rather than
  // clamped, so we can't just emit "lo << a" if a is too big.
  SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
  SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
                           LoForNormalShift, CCVal, Cmp);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

bool AArch64TargetLowering::isOffsetFoldingLegal(
    const GlobalAddressSDNode *GA) const {
  // Offsets are folded in the DAG combine rather than here so that we can
  // intelligently choose an offset based on the uses.
  return false;
}

bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                         bool OptForSize) const {
  bool IsLegal = false;
  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
  // 16-bit case when target has full fp16 support.
  // FIXME: We should be able to handle f128 as well with a clever lowering.
  const APInt ImmInt = Imm.bitcastToAPInt();
  if (VT == MVT::f64)
    IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
  else if (VT == MVT::f32)
    IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
  else if (VT == MVT::f16 && Subtarget->hasFullFP16())
    IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
  // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
  // generate that fmov.

  // If we cannot materialize the immediate for fmov, check if the value can
  // be encoded as the immediate operand of a logical instruction. The
  // immediate value will be created with either MOVZ, MOVN, or ORR.
  if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
    // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
    // however the mov+fmov sequence is always better because of the reduced
    // cache pressure. The timings are still the same if you consider
    // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
    // movw+movk is fused). So we limit it to at most 2 instructions.
    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
    AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
                              Insn);
    unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
    IsLegal = Insn.size() <= Limit;
  }

  LLVM_DEBUG(dbgs() << (IsLegal ?
"Legal " : "Illegal ") << VT.getEVTString() 5512 << " imm value: "; Imm.dump();); 5513 return IsLegal; 5514 } 5515 5516 //===----------------------------------------------------------------------===// 5517 // AArch64 Optimization Hooks 5518 //===----------------------------------------------------------------------===// 5519 5520 static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, 5521 SDValue Operand, SelectionDAG &DAG, 5522 int &ExtraSteps) { 5523 EVT VT = Operand.getValueType(); 5524 if (ST->hasNEON() && 5525 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || 5526 VT == MVT::f32 || VT == MVT::v1f32 || 5527 VT == MVT::v2f32 || VT == MVT::v4f32)) { 5528 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) 5529 // For the reciprocal estimates, convergence is quadratic, so the number 5530 // of digits is doubled after each iteration. In ARMv8, the accuracy of 5531 // the initial estimate is 2^-8. Thus the number of extra steps to refine 5532 // the result for float (23 mantissa bits) is 2 and for double (52 5533 // mantissa bits) is 3. 5534 ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2; 5535 5536 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); 5537 } 5538 5539 return SDValue(); 5540 } 5541 5542 SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, 5543 SelectionDAG &DAG, int Enabled, 5544 int &ExtraSteps, 5545 bool &UseOneConst, 5546 bool Reciprocal) const { 5547 if (Enabled == ReciprocalEstimate::Enabled || 5548 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt())) 5549 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, 5550 DAG, ExtraSteps)) { 5551 SDLoc DL(Operand); 5552 EVT VT = Operand.getValueType(); 5553 5554 SDNodeFlags Flags; 5555 Flags.setAllowReassociation(true); 5556 5557 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) 5558 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) 5559 for (int i = ExtraSteps; i > 0; --i) { 5560 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, 5561 Flags); 5562 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags); 5563 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); 5564 } 5565 if (!Reciprocal) { 5566 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), 5567 VT); 5568 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); 5569 SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ); 5570 5571 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags); 5572 // Correct the result if the operand is 0.0. 5573 Estimate = DAG.getNode(VT.isVector() ? 
ISD::VSELECT : ISD::SELECT, DL, 5574 VT, Eq, Operand, Estimate); 5575 } 5576 5577 ExtraSteps = 0; 5578 return Estimate; 5579 } 5580 5581 return SDValue(); 5582 } 5583 5584 SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, 5585 SelectionDAG &DAG, int Enabled, 5586 int &ExtraSteps) const { 5587 if (Enabled == ReciprocalEstimate::Enabled) 5588 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, 5589 DAG, ExtraSteps)) { 5590 SDLoc DL(Operand); 5591 EVT VT = Operand.getValueType(); 5592 5593 SDNodeFlags Flags; 5594 Flags.setAllowReassociation(true); 5595 5596 // Newton reciprocal iteration: E * (2 - X * E) 5597 // AArch64 reciprocal iteration instruction: (2 - M * N) 5598 for (int i = ExtraSteps; i > 0; --i) { 5599 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, 5600 Estimate, Flags); 5601 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); 5602 } 5603 5604 ExtraSteps = 0; 5605 return Estimate; 5606 } 5607 5608 return SDValue(); 5609 } 5610 5611 //===----------------------------------------------------------------------===// 5612 // AArch64 Inline Assembly Support 5613 //===----------------------------------------------------------------------===// 5614 5615 // Table of Constraints 5616 // TODO: This is the current set of constraints supported by ARM for the 5617 // compiler, not all of them may make sense. 5618 // 5619 // r - A general register 5620 // w - An FP/SIMD register of some size in the range v0-v31 5621 // x - An FP/SIMD register of some size in the range v0-v15 5622 // I - Constant that can be used with an ADD instruction 5623 // J - Constant that can be used with a SUB instruction 5624 // K - Constant that can be used with a 32-bit logical instruction 5625 // L - Constant that can be used with a 64-bit logical instruction 5626 // M - Constant that can be used as a 32-bit MOV immediate 5627 // N - Constant that can be used as a 64-bit MOV immediate 5628 // Q - A memory reference with base register and no offset 5629 // S - A symbolic address 5630 // Y - Floating point constant zero 5631 // Z - Integer constant zero 5632 // 5633 // Note that general register operands will be output using their 64-bit x 5634 // register name, whatever the size of the variable, unless the asm operand 5635 // is prefixed by the %w modifier. Floating-point and SIMD register operands 5636 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or 5637 // %q modifier. 5638 const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { 5639 // At this point, we have to lower this constraint to something else, so we 5640 // lower it to an "r" or "w". However, by doing this we will force the result 5641 // to be in register, while the X constraint is much more permissive. 5642 // 5643 // Although we are correct (we are free to emit anything, without 5644 // constraints), we might break use cases that would expect us to be more 5645 // efficient and emit something else. 5646 if (!Subtarget->hasFPARMv8()) 5647 return "r"; 5648 5649 if (ConstraintVT.isFloatingPoint()) 5650 return "w"; 5651 5652 if (ConstraintVT.isVector() && 5653 (ConstraintVT.getSizeInBits() == 64 || 5654 ConstraintVT.getSizeInBits() == 128)) 5655 return "w"; 5656 5657 return "r"; 5658 } 5659 5660 /// getConstraintType - Given a constraint letter, return the type of 5661 /// constraint it is for this target. 
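/// For example, 'w' and 'x' are register-class constraints, 'Q' is a memory
/// constraint, and 'I' through 'N' are immediate constraints whose values are
/// validated in LowerAsmOperandForConstraint below.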
5662 AArch64TargetLowering::ConstraintType 5663 AArch64TargetLowering::getConstraintType(StringRef Constraint) const { 5664 if (Constraint.size() == 1) { 5665 switch (Constraint[0]) { 5666 default: 5667 break; 5668 case 'x': 5669 case 'w': 5670 return C_RegisterClass; 5671 // An address with a single base register. Due to the way we 5672 // currently handle addresses it is the same as 'r'. 5673 case 'Q': 5674 return C_Memory; 5675 case 'I': 5676 case 'J': 5677 case 'K': 5678 case 'L': 5679 case 'M': 5680 case 'N': 5681 case 'Y': 5682 case 'Z': 5683 return C_Immediate; 5684 case 'z': 5685 case 'S': // A symbolic address 5686 return C_Other; 5687 } 5688 } 5689 return TargetLowering::getConstraintType(Constraint); 5690 } 5691 5692 /// Examine constraint type and operand type and determine a weight value. 5693 /// This object must already have been set up with the operand type 5694 /// and the current alternative constraint selected. 5695 TargetLowering::ConstraintWeight 5696 AArch64TargetLowering::getSingleConstraintMatchWeight( 5697 AsmOperandInfo &info, const char *constraint) const { 5698 ConstraintWeight weight = CW_Invalid; 5699 Value *CallOperandVal = info.CallOperandVal; 5700 // If we don't have a value, we can't do a match, 5701 // but allow it at the lowest weight. 5702 if (!CallOperandVal) 5703 return CW_Default; 5704 Type *type = CallOperandVal->getType(); 5705 // Look at the constraint type. 5706 switch (*constraint) { 5707 default: 5708 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 5709 break; 5710 case 'x': 5711 case 'w': 5712 if (type->isFloatingPointTy() || type->isVectorTy()) 5713 weight = CW_Register; 5714 break; 5715 case 'z': 5716 weight = CW_Constant; 5717 break; 5718 } 5719 return weight; 5720 } 5721 5722 std::pair<unsigned, const TargetRegisterClass *> 5723 AArch64TargetLowering::getRegForInlineAsmConstraint( 5724 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 5725 if (Constraint.size() == 1) { 5726 switch (Constraint[0]) { 5727 case 'r': 5728 if (VT.getSizeInBits() == 64) 5729 return std::make_pair(0U, &AArch64::GPR64commonRegClass); 5730 return std::make_pair(0U, &AArch64::GPR32commonRegClass); 5731 case 'w': 5732 if (!Subtarget->hasFPARMv8()) 5733 break; 5734 if (VT.getSizeInBits() == 16) 5735 return std::make_pair(0U, &AArch64::FPR16RegClass); 5736 if (VT.getSizeInBits() == 32) 5737 return std::make_pair(0U, &AArch64::FPR32RegClass); 5738 if (VT.getSizeInBits() == 64) 5739 return std::make_pair(0U, &AArch64::FPR64RegClass); 5740 if (VT.getSizeInBits() == 128) 5741 return std::make_pair(0U, &AArch64::FPR128RegClass); 5742 break; 5743 // The instructions that this constraint is designed for can 5744 // only take 128-bit registers so just use that regclass. 5745 case 'x': 5746 if (!Subtarget->hasFPARMv8()) 5747 break; 5748 if (VT.getSizeInBits() == 128) 5749 return std::make_pair(0U, &AArch64::FPR128_loRegClass); 5750 break; 5751 } 5752 } 5753 if (StringRef("{cc}").equals_lower(Constraint)) 5754 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); 5755 5756 // Use the default implementation in TargetLowering to convert the register 5757 // constraint into a member of a register class. 5758 std::pair<unsigned, const TargetRegisterClass *> Res; 5759 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 5760 5761 // Not found as a standard register? 
5762 if (!Res.second) { 5763 unsigned Size = Constraint.size(); 5764 if ((Size == 4 || Size == 5) && Constraint[0] == '{' && 5765 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { 5766 int RegNo; 5767 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); 5768 if (!Failed && RegNo >= 0 && RegNo <= 31) { 5769 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. 5770 // By default we'll emit v0-v31 for this unless there's a modifier where 5771 // we'll emit the correct register as well. 5772 if (VT != MVT::Other && VT.getSizeInBits() == 64) { 5773 Res.first = AArch64::FPR64RegClass.getRegister(RegNo); 5774 Res.second = &AArch64::FPR64RegClass; 5775 } else { 5776 Res.first = AArch64::FPR128RegClass.getRegister(RegNo); 5777 Res.second = &AArch64::FPR128RegClass; 5778 } 5779 } 5780 } 5781 } 5782 5783 if (Res.second && !Subtarget->hasFPARMv8() && 5784 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) && 5785 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second)) 5786 return std::make_pair(0U, nullptr); 5787 5788 return Res; 5789 } 5790 5791 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 5792 /// vector. If it is invalid, don't add anything to Ops. 5793 void AArch64TargetLowering::LowerAsmOperandForConstraint( 5794 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, 5795 SelectionDAG &DAG) const { 5796 SDValue Result; 5797 5798 // Currently only support length 1 constraints. 5799 if (Constraint.length() != 1) 5800 return; 5801 5802 char ConstraintLetter = Constraint[0]; 5803 switch (ConstraintLetter) { 5804 default: 5805 break; 5806 5807 // This set of constraints deal with valid constants for various instructions. 5808 // Validate and return a target constant for them if we can. 5809 case 'z': { 5810 // 'z' maps to xzr or wzr so it needs an input of 0. 5811 if (!isNullConstant(Op)) 5812 return; 5813 5814 if (Op.getValueType() == MVT::i64) 5815 Result = DAG.getRegister(AArch64::XZR, MVT::i64); 5816 else 5817 Result = DAG.getRegister(AArch64::WZR, MVT::i32); 5818 break; 5819 } 5820 case 'S': { 5821 // An absolute symbolic address or label reference. 5822 if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) { 5823 Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op), 5824 GA->getValueType(0)); 5825 } else if (const BlockAddressSDNode *BA = 5826 dyn_cast<BlockAddressSDNode>(Op)) { 5827 Result = 5828 DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0)); 5829 } else if (const ExternalSymbolSDNode *ES = 5830 dyn_cast<ExternalSymbolSDNode>(Op)) { 5831 Result = 5832 DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0)); 5833 } else 5834 return; 5835 break; 5836 } 5837 5838 case 'I': 5839 case 'J': 5840 case 'K': 5841 case 'L': 5842 case 'M': 5843 case 'N': 5844 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 5845 if (!C) 5846 return; 5847 5848 // Grab the value and do some validation. 5849 uint64_t CVal = C->getZExtValue(); 5850 switch (ConstraintLetter) { 5851 // The I constraint applies only to simple ADD or SUB immediate operands: 5852 // i.e. 0 to 4095 with optional shift by 12 5853 // The J constraint applies only to ADD or SUB immediates that would be 5854 // valid when negated, i.e. if [an add pattern] were to be output as a SUB 5855 // instruction [or vice versa], in other words -1 to -4095 with optional 5856 // left shift by 12. 
5857 case 'I': 5858 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) 5859 break; 5860 return; 5861 case 'J': { 5862 uint64_t NVal = -C->getSExtValue(); 5863 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { 5864 CVal = C->getSExtValue(); 5865 break; 5866 } 5867 return; 5868 } 5869 // The K and L constraints apply *only* to logical immediates, including 5870 // what used to be the MOVI alias for ORR (though the MOVI alias has now 5871 // been removed and MOV should be used). So these constraints have to 5872 // distinguish between bit patterns that are valid 32-bit or 64-bit 5873 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but 5874 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice 5875 // versa. 5876 case 'K': 5877 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 5878 break; 5879 return; 5880 case 'L': 5881 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 5882 break; 5883 return; 5884 // The M and N constraints are a superset of K and L respectively, for use 5885 // with the MOV (immediate) alias. As well as the logical immediates they 5886 // also match 32 or 64-bit immediates that can be loaded either using a 5887 // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca 5888 // (M) or 64-bit 0x1234000000000000 (N) etc. 5889 // As a note some of this code is liberally stolen from the asm parser. 5890 case 'M': { 5891 if (!isUInt<32>(CVal)) 5892 return; 5893 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 5894 break; 5895 if ((CVal & 0xFFFF) == CVal) 5896 break; 5897 if ((CVal & 0xFFFF0000ULL) == CVal) 5898 break; 5899 uint64_t NCVal = ~(uint32_t)CVal; 5900 if ((NCVal & 0xFFFFULL) == NCVal) 5901 break; 5902 if ((NCVal & 0xFFFF0000ULL) == NCVal) 5903 break; 5904 return; 5905 } 5906 case 'N': { 5907 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 5908 break; 5909 if ((CVal & 0xFFFFULL) == CVal) 5910 break; 5911 if ((CVal & 0xFFFF0000ULL) == CVal) 5912 break; 5913 if ((CVal & 0xFFFF00000000ULL) == CVal) 5914 break; 5915 if ((CVal & 0xFFFF000000000000ULL) == CVal) 5916 break; 5917 uint64_t NCVal = ~CVal; 5918 if ((NCVal & 0xFFFFULL) == NCVal) 5919 break; 5920 if ((NCVal & 0xFFFF0000ULL) == NCVal) 5921 break; 5922 if ((NCVal & 0xFFFF00000000ULL) == NCVal) 5923 break; 5924 if ((NCVal & 0xFFFF000000000000ULL) == NCVal) 5925 break; 5926 return; 5927 } 5928 default: 5929 return; 5930 } 5931 5932 // All assembler immediates are 64-bit integers. 5933 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64); 5934 break; 5935 } 5936 5937 if (Result.getNode()) { 5938 Ops.push_back(Result); 5939 return; 5940 } 5941 5942 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 5943 } 5944 5945 //===----------------------------------------------------------------------===// 5946 // AArch64 Advanced SIMD Support 5947 //===----------------------------------------------------------------------===// 5948 5949 /// WidenVector - Given a value in the V64 register class, produce the 5950 /// equivalent value in the V128 register class. 
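/// For example, a v2i32 value widens to v4i32 with the original elements in
/// lanes 0-1 and undef in lanes 2-3.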
5951 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { 5952 EVT VT = V64Reg.getValueType(); 5953 unsigned NarrowSize = VT.getVectorNumElements(); 5954 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 5955 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); 5956 SDLoc DL(V64Reg); 5957 5958 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), 5959 V64Reg, DAG.getConstant(0, DL, MVT::i32)); 5960 } 5961 5962 /// getExtFactor - Determine the adjustment factor for the position when 5963 /// generating an "extract from vector registers" instruction. 5964 static unsigned getExtFactor(SDValue &V) { 5965 EVT EltType = V.getValueType().getVectorElementType(); 5966 return EltType.getSizeInBits() / 8; 5967 } 5968 5969 /// NarrowVector - Given a value in the V128 register class, produce the 5970 /// equivalent value in the V64 register class. 5971 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { 5972 EVT VT = V128Reg.getValueType(); 5973 unsigned WideSize = VT.getVectorNumElements(); 5974 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 5975 MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); 5976 SDLoc DL(V128Reg); 5977 5978 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); 5979 } 5980 5981 // Gather data to see if the operation can be modelled as a 5982 // shuffle in combination with VEXTs. 5983 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, 5984 SelectionDAG &DAG) const { 5985 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 5986 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n"); 5987 SDLoc dl(Op); 5988 EVT VT = Op.getValueType(); 5989 unsigned NumElts = VT.getVectorNumElements(); 5990 5991 struct ShuffleSourceInfo { 5992 SDValue Vec; 5993 unsigned MinElt; 5994 unsigned MaxElt; 5995 5996 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 5997 // be compatible with the shuffle we intend to construct. As a result 5998 // ShuffleVec will be some sliding window into the original Vec. 5999 SDValue ShuffleVec; 6000 6001 // Code should guarantee that element i in Vec starts at element "WindowBase 6002 // + i * WindowScale in ShuffleVec". 6003 int WindowBase; 6004 int WindowScale; 6005 6006 ShuffleSourceInfo(SDValue Vec) 6007 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0), 6008 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} 6009 6010 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 6011 }; 6012 6013 // First gather all vectors used as an immediate source for this BUILD_VECTOR 6014 // node. 6015 SmallVector<ShuffleSourceInfo, 2> Sources; 6016 for (unsigned i = 0; i < NumElts; ++i) { 6017 SDValue V = Op.getOperand(i); 6018 if (V.isUndef()) 6019 continue; 6020 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 6021 !isa<ConstantSDNode>(V.getOperand(1))) { 6022 LLVM_DEBUG( 6023 dbgs() << "Reshuffle failed: " 6024 "a shuffle can only come from building a vector from " 6025 "various elements of other vectors, provided their " 6026 "indices are constant\n"); 6027 return SDValue(); 6028 } 6029 6030 // Add this element source to the list if it's not already there. 6031 SDValue SourceVec = V.getOperand(0); 6032 auto Source = find(Sources, SourceVec); 6033 if (Source == Sources.end()) 6034 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 6035 6036 // Update the minimum and maximum lane number seen. 
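    // These bounds are used below to decide whether this source's elements
    // fit entirely in one half of a double-width vector or whether an EXT is
    // needed to slide a window over them.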
6037 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 6038 Source->MinElt = std::min(Source->MinElt, EltNo); 6039 Source->MaxElt = std::max(Source->MaxElt, EltNo); 6040 } 6041 6042 if (Sources.size() > 2) { 6043 LLVM_DEBUG( 6044 dbgs() << "Reshuffle failed: currently only do something sane when at " 6045 "most two source vectors are involved\n"); 6046 return SDValue(); 6047 } 6048 6049 // Find out the smallest element size among result and two sources, and use 6050 // it as element size to build the shuffle_vector. 6051 EVT SmallestEltTy = VT.getVectorElementType(); 6052 for (auto &Source : Sources) { 6053 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 6054 if (SrcEltTy.bitsLT(SmallestEltTy)) { 6055 SmallestEltTy = SrcEltTy; 6056 } 6057 } 6058 unsigned ResMultiplier = 6059 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); 6060 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 6061 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 6062 6063 // If the source vector is too wide or too narrow, we may nevertheless be able 6064 // to construct a compatible shuffle either by concatenating it with UNDEF or 6065 // extracting a suitable range of elements. 6066 for (auto &Src : Sources) { 6067 EVT SrcVT = Src.ShuffleVec.getValueType(); 6068 6069 if (SrcVT.getSizeInBits() == VT.getSizeInBits()) 6070 continue; 6071 6072 // This stage of the search produces a source with the same element type as 6073 // the original, but with a total width matching the BUILD_VECTOR output. 6074 EVT EltVT = SrcVT.getVectorElementType(); 6075 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); 6076 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 6077 6078 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { 6079 assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits()); 6080 // We can pad out the smaller vector for free, so if it's part of a 6081 // shuffle... 6082 Src.ShuffleVec = 6083 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 6084 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 6085 continue; 6086 } 6087 6088 assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits()); 6089 6090 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 6091 LLVM_DEBUG( 6092 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n"); 6093 return SDValue(); 6094 } 6095 6096 if (Src.MinElt >= NumSrcElts) { 6097 // The extraction can just take the second half 6098 Src.ShuffleVec = 6099 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6100 DAG.getConstant(NumSrcElts, dl, MVT::i64)); 6101 Src.WindowBase = -NumSrcElts; 6102 } else if (Src.MaxElt < NumSrcElts) { 6103 // The extraction can just take the first half 6104 Src.ShuffleVec = 6105 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6106 DAG.getConstant(0, dl, MVT::i64)); 6107 } else { 6108 // An actual VEXT is needed 6109 SDValue VEXTSrc1 = 6110 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6111 DAG.getConstant(0, dl, MVT::i64)); 6112 SDValue VEXTSrc2 = 6113 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 6114 DAG.getConstant(NumSrcElts, dl, MVT::i64)); 6115 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); 6116 6117 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, 6118 VEXTSrc2, 6119 DAG.getConstant(Imm, dl, MVT::i32)); 6120 Src.WindowBase = -Src.MinElt; 6121 } 6122 } 6123 6124 // Another possible incompatibility occurs from the vector element types. 
We 6125 // can fix this by bitcasting the source vectors to the same type we intend 6126 // for the shuffle. 6127 for (auto &Src : Sources) { 6128 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 6129 if (SrcEltTy == SmallestEltTy) 6130 continue; 6131 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 6132 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 6133 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 6134 Src.WindowBase *= Src.WindowScale; 6135 } 6136 6137 // Final sanity check before we try to actually produce a shuffle. 6138 LLVM_DEBUG(for (auto Src 6139 : Sources) 6140 assert(Src.ShuffleVec.getValueType() == ShuffleVT);); 6141 6142 // The stars all align, our next step is to produce the mask for the shuffle. 6143 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 6144 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); 6145 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 6146 SDValue Entry = Op.getOperand(i); 6147 if (Entry.isUndef()) 6148 continue; 6149 6150 auto Src = find(Sources, Entry.getOperand(0)); 6151 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 6152 6153 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 6154 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 6155 // segment. 6156 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 6157 int BitsDefined = 6158 std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits()); 6159 int LanesDefined = BitsDefined / BitsPerShuffleLane; 6160 6161 // This source is expected to fill ResMultiplier lanes of the final shuffle, 6162 // starting at the appropriate offset. 6163 int *LaneMask = &Mask[i * ResMultiplier]; 6164 6165 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 6166 ExtractBase += NumElts * (Src - Sources.begin()); 6167 for (int j = 0; j < LanesDefined; ++j) 6168 LaneMask[j] = ExtractBase + j; 6169 } 6170 6171 // Final check before we try to produce nonsense... 6172 if (!isShuffleMaskLegal(Mask, ShuffleVT)) { 6173 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n"); 6174 return SDValue(); 6175 } 6176 6177 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 6178 for (unsigned i = 0; i < Sources.size(); ++i) 6179 ShuffleOps[i] = Sources[i].ShuffleVec; 6180 6181 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 6182 ShuffleOps[1], Mask); 6183 SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 6184 6185 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump(); 6186 dbgs() << "Reshuffle, creating node: "; V.dump();); 6187 6188 return V; 6189 } 6190 6191 // check if an EXT instruction can handle the shuffle mask when the 6192 // vector sources of the shuffle are the same. 6193 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 6194 unsigned NumElts = VT.getVectorNumElements(); 6195 6196 // Assume that the first shuffle index is not UNDEF. Fail if it is. 6197 if (M[0] < 0) 6198 return false; 6199 6200 Imm = M[0]; 6201 6202 // If this is a VEXT shuffle, the immediate value is the index of the first 6203 // element. The other shuffle indices must be the successive elements after 6204 // the first one. 6205 unsigned ExpectedElt = Imm; 6206 for (unsigned i = 1; i < NumElts; ++i) { 6207 // Increment the expected index. If it wraps around, just follow it 6208 // back to index zero and keep going. 
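    // For example, the mask <3, 0, 1, 2> over a single 4-element source is a
    // valid EXT with Imm == 3: the expected index wraps from 3 back to 0.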
    ++ExpectedElt;
    if (ExpectedElt == NumElts)
      ExpectedElt = 0;

    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if (ExpectedElt != static_cast<unsigned>(M[i]))
      return false;
  }

  return true;
}

// check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are different.
static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
                      unsigned &Imm) {
  // Look for the first non-undef element.
  const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });

  // Benefit from APInt to handle overflow when calculating expected element.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
  APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
  // The following shuffle indices must be the successive elements after the
  // first real element.
  const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
      [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
  if (FirstWrongElt != M.end())
    return false;

  // The index of an EXT is the first element if it is not UNDEF.
  // Watch out for the beginning UNDEFs. The EXT index should be the expected
  // value of the first element. E.g.
  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
  // ExpectedElt is the last mask index plus 1.
  Imm = ExpectedElt.getZExtValue();

  // There are two different cases that require the input vectors to be
  // reversed. For example, for the vector <4 x i32> we have:
  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
  // In both cases we end up using the mask <5, 6, 7, 0>, which requires the
  // two input vectors to be reversed.
  if (Imm < NumElts)
    ReverseEXT = true;
  else
    Imm -= NumElts;

  return true;
}

/// isREVMask - Check if a vector shuffle corresponds to a REV
/// instruction with the specified blocksize. (The order of the elements
/// within each block of the vector is reversed.)
static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
         "Only possible block sizes for REV are: 16, 32, 64");

  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  unsigned BlockElts = M[0] + 1;
  // If the first shuffle index is UNDEF, be optimistic.
  if (M[0] < 0)
    BlockElts = BlockSize / EltSz;

  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
    return false;

  for (unsigned i = 0; i < NumElts; ++i) {
    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
      return false;
  }

  return true;
}

static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  WhichResult = (M[0] == 0 ?
0 : 1); 6295 unsigned Idx = WhichResult * NumElts / 2; 6296 for (unsigned i = 0; i != NumElts; i += 2) { 6297 if ((M[i] >= 0 && (unsigned)M[i] != Idx) || 6298 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) 6299 return false; 6300 Idx += 1; 6301 } 6302 6303 return true; 6304 } 6305 6306 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6307 unsigned NumElts = VT.getVectorNumElements(); 6308 WhichResult = (M[0] == 0 ? 0 : 1); 6309 for (unsigned i = 0; i != NumElts; ++i) { 6310 if (M[i] < 0) 6311 continue; // ignore UNDEF indices 6312 if ((unsigned)M[i] != 2 * i + WhichResult) 6313 return false; 6314 } 6315 6316 return true; 6317 } 6318 6319 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6320 unsigned NumElts = VT.getVectorNumElements(); 6321 if (NumElts % 2 != 0) 6322 return false; 6323 WhichResult = (M[0] == 0 ? 0 : 1); 6324 for (unsigned i = 0; i < NumElts; i += 2) { 6325 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || 6326 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) 6327 return false; 6328 } 6329 return true; 6330 } 6331 6332 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of 6333 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 6334 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 6335 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6336 unsigned NumElts = VT.getVectorNumElements(); 6337 if (NumElts % 2 != 0) 6338 return false; 6339 WhichResult = (M[0] == 0 ? 0 : 1); 6340 unsigned Idx = WhichResult * NumElts / 2; 6341 for (unsigned i = 0; i != NumElts; i += 2) { 6342 if ((M[i] >= 0 && (unsigned)M[i] != Idx) || 6343 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) 6344 return false; 6345 Idx += 1; 6346 } 6347 6348 return true; 6349 } 6350 6351 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of 6352 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 6353 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 6354 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6355 unsigned Half = VT.getVectorNumElements() / 2; 6356 WhichResult = (M[0] == 0 ? 0 : 1); 6357 for (unsigned j = 0; j != 2; ++j) { 6358 unsigned Idx = WhichResult; 6359 for (unsigned i = 0; i != Half; ++i) { 6360 int MIdx = M[i + j * Half]; 6361 if (MIdx >= 0 && (unsigned)MIdx != Idx) 6362 return false; 6363 Idx += 2; 6364 } 6365 } 6366 6367 return true; 6368 } 6369 6370 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of 6371 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 6372 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 6373 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 6374 unsigned NumElts = VT.getVectorNumElements(); 6375 if (NumElts % 2 != 0) 6376 return false; 6377 WhichResult = (M[0] == 0 ? 
0 : 1); 6378 for (unsigned i = 0; i < NumElts; i += 2) { 6379 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || 6380 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) 6381 return false; 6382 } 6383 return true; 6384 } 6385 6386 static bool isINSMask(ArrayRef<int> M, int NumInputElements, 6387 bool &DstIsLeft, int &Anomaly) { 6388 if (M.size() != static_cast<size_t>(NumInputElements)) 6389 return false; 6390 6391 int NumLHSMatch = 0, NumRHSMatch = 0; 6392 int LastLHSMismatch = -1, LastRHSMismatch = -1; 6393 6394 for (int i = 0; i < NumInputElements; ++i) { 6395 if (M[i] == -1) { 6396 ++NumLHSMatch; 6397 ++NumRHSMatch; 6398 continue; 6399 } 6400 6401 if (M[i] == i) 6402 ++NumLHSMatch; 6403 else 6404 LastLHSMismatch = i; 6405 6406 if (M[i] == i + NumInputElements) 6407 ++NumRHSMatch; 6408 else 6409 LastRHSMismatch = i; 6410 } 6411 6412 if (NumLHSMatch == NumInputElements - 1) { 6413 DstIsLeft = true; 6414 Anomaly = LastLHSMismatch; 6415 return true; 6416 } else if (NumRHSMatch == NumInputElements - 1) { 6417 DstIsLeft = false; 6418 Anomaly = LastRHSMismatch; 6419 return true; 6420 } 6421 6422 return false; 6423 } 6424 6425 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) { 6426 if (VT.getSizeInBits() != 128) 6427 return false; 6428 6429 unsigned NumElts = VT.getVectorNumElements(); 6430 6431 for (int I = 0, E = NumElts / 2; I != E; I++) { 6432 if (Mask[I] != I) 6433 return false; 6434 } 6435 6436 int Offset = NumElts / 2; 6437 for (int I = NumElts / 2, E = NumElts; I != E; I++) { 6438 if (Mask[I] != I + SplitLHS * Offset) 6439 return false; 6440 } 6441 6442 return true; 6443 } 6444 6445 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { 6446 SDLoc DL(Op); 6447 EVT VT = Op.getValueType(); 6448 SDValue V0 = Op.getOperand(0); 6449 SDValue V1 = Op.getOperand(1); 6450 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); 6451 6452 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || 6453 VT.getVectorElementType() != V1.getValueType().getVectorElementType()) 6454 return SDValue(); 6455 6456 bool SplitV0 = V0.getValueSizeInBits() == 128; 6457 6458 if (!isConcatMask(Mask, VT, SplitV0)) 6459 return SDValue(); 6460 6461 EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 6462 VT.getVectorNumElements() / 2); 6463 if (SplitV0) { 6464 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, 6465 DAG.getConstant(0, DL, MVT::i64)); 6466 } 6467 if (V1.getValueSizeInBits() == 128) { 6468 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, 6469 DAG.getConstant(0, DL, MVT::i64)); 6470 } 6471 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); 6472 } 6473 6474 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 6475 /// the specified operations to build the shuffle. 
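/// Each PFEntry packs the cost into bits [31:30], the opcode into bits
/// [29:26], and two 13-bit operand IDs into bits [25:13] and [12:0]; the
/// operand IDs are expanded recursively through the same table until an
/// OP_COPY terminal is reached.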
6476 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 6477 SDValue RHS, SelectionDAG &DAG, 6478 const SDLoc &dl) { 6479 unsigned OpNum = (PFEntry >> 26) & 0x0F; 6480 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); 6481 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); 6482 6483 enum { 6484 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 6485 OP_VREV, 6486 OP_VDUP0, 6487 OP_VDUP1, 6488 OP_VDUP2, 6489 OP_VDUP3, 6490 OP_VEXT1, 6491 OP_VEXT2, 6492 OP_VEXT3, 6493 OP_VUZPL, // VUZP, left result 6494 OP_VUZPR, // VUZP, right result 6495 OP_VZIPL, // VZIP, left result 6496 OP_VZIPR, // VZIP, right result 6497 OP_VTRNL, // VTRN, left result 6498 OP_VTRNR // VTRN, right result 6499 }; 6500 6501 if (OpNum == OP_COPY) { 6502 if (LHSID == (1 * 9 + 2) * 9 + 3) 6503 return LHS; 6504 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); 6505 return RHS; 6506 } 6507 6508 SDValue OpLHS, OpRHS; 6509 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 6510 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 6511 EVT VT = OpLHS.getValueType(); 6512 6513 switch (OpNum) { 6514 default: 6515 llvm_unreachable("Unknown shuffle opcode!"); 6516 case OP_VREV: 6517 // VREV divides the vector in half and swaps within the half. 6518 if (VT.getVectorElementType() == MVT::i32 || 6519 VT.getVectorElementType() == MVT::f32) 6520 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); 6521 // vrev <4 x i16> -> REV32 6522 if (VT.getVectorElementType() == MVT::i16 || 6523 VT.getVectorElementType() == MVT::f16) 6524 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); 6525 // vrev <4 x i8> -> REV16 6526 assert(VT.getVectorElementType() == MVT::i8); 6527 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); 6528 case OP_VDUP0: 6529 case OP_VDUP1: 6530 case OP_VDUP2: 6531 case OP_VDUP3: { 6532 EVT EltTy = VT.getVectorElementType(); 6533 unsigned Opcode; 6534 if (EltTy == MVT::i8) 6535 Opcode = AArch64ISD::DUPLANE8; 6536 else if (EltTy == MVT::i16 || EltTy == MVT::f16) 6537 Opcode = AArch64ISD::DUPLANE16; 6538 else if (EltTy == MVT::i32 || EltTy == MVT::f32) 6539 Opcode = AArch64ISD::DUPLANE32; 6540 else if (EltTy == MVT::i64 || EltTy == MVT::f64) 6541 Opcode = AArch64ISD::DUPLANE64; 6542 else 6543 llvm_unreachable("Invalid vector element type?"); 6544 6545 if (VT.getSizeInBits() == 64) 6546 OpLHS = WidenVector(OpLHS, DAG); 6547 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64); 6548 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); 6549 } 6550 case OP_VEXT1: 6551 case OP_VEXT2: 6552 case OP_VEXT3: { 6553 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); 6554 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, 6555 DAG.getConstant(Imm, dl, MVT::i32)); 6556 } 6557 case OP_VUZPL: 6558 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, 6559 OpRHS); 6560 case OP_VUZPR: 6561 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, 6562 OpRHS); 6563 case OP_VZIPL: 6564 return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, 6565 OpRHS); 6566 case OP_VZIPR: 6567 return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, 6568 OpRHS); 6569 case OP_VTRNL: 6570 return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, 6571 OpRHS); 6572 case OP_VTRNR: 6573 return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, 6574 OpRHS); 6575 } 6576 } 6577 6578 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> 
ShuffleMask, 6579 SelectionDAG &DAG) { 6580 // Check to see if we can use the TBL instruction. 6581 SDValue V1 = Op.getOperand(0); 6582 SDValue V2 = Op.getOperand(1); 6583 SDLoc DL(Op); 6584 6585 EVT EltVT = Op.getValueType().getVectorElementType(); 6586 unsigned BytesPerElt = EltVT.getSizeInBits() / 8; 6587 6588 SmallVector<SDValue, 8> TBLMask; 6589 for (int Val : ShuffleMask) { 6590 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { 6591 unsigned Offset = Byte + Val * BytesPerElt; 6592 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); 6593 } 6594 } 6595 6596 MVT IndexVT = MVT::v8i8; 6597 unsigned IndexLen = 8; 6598 if (Op.getValueSizeInBits() == 128) { 6599 IndexVT = MVT::v16i8; 6600 IndexLen = 16; 6601 } 6602 6603 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); 6604 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); 6605 6606 SDValue Shuffle; 6607 if (V2.getNode()->isUndef()) { 6608 if (IndexLen == 8) 6609 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); 6610 Shuffle = DAG.getNode( 6611 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 6612 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, 6613 DAG.getBuildVector(IndexVT, DL, 6614 makeArrayRef(TBLMask.data(), IndexLen))); 6615 } else { 6616 if (IndexLen == 8) { 6617 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); 6618 Shuffle = DAG.getNode( 6619 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 6620 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, 6621 DAG.getBuildVector(IndexVT, DL, 6622 makeArrayRef(TBLMask.data(), IndexLen))); 6623 } else { 6624 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we 6625 // cannot currently represent the register constraints on the input 6626 // table registers. 6627 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, 6628 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], 6629 // IndexLen)); 6630 Shuffle = DAG.getNode( 6631 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 6632 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, 6633 V2Cst, DAG.getBuildVector(IndexVT, DL, 6634 makeArrayRef(TBLMask.data(), IndexLen))); 6635 } 6636 } 6637 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); 6638 } 6639 6640 static unsigned getDUPLANEOp(EVT EltType) { 6641 if (EltType == MVT::i8) 6642 return AArch64ISD::DUPLANE8; 6643 if (EltType == MVT::i16 || EltType == MVT::f16) 6644 return AArch64ISD::DUPLANE16; 6645 if (EltType == MVT::i32 || EltType == MVT::f32) 6646 return AArch64ISD::DUPLANE32; 6647 if (EltType == MVT::i64 || EltType == MVT::f64) 6648 return AArch64ISD::DUPLANE64; 6649 6650 llvm_unreachable("Invalid vector element type?"); 6651 } 6652 6653 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 6654 SelectionDAG &DAG) const { 6655 SDLoc dl(Op); 6656 EVT VT = Op.getValueType(); 6657 6658 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 6659 6660 // Convert shuffles that are directly supported on NEON to target-specific 6661 // DAG nodes, instead of keeping them as shuffles and matching them again 6662 // during code selection. This is more efficient and avoids the possibility 6663 // of inconsistencies between legalization and selection. 6664 ArrayRef<int> ShuffleMask = SVN->getMask(); 6665 6666 SDValue V1 = Op.getOperand(0); 6667 SDValue V2 = Op.getOperand(1); 6668 6669 if (SVN->isSplat()) { 6670 int Lane = SVN->getSplatIndex(); 6671 // If this is undef splat, generate it via "just" vdup, if possible. 
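    // An undef splat index means any lane will do, so default to lane 0.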
    if (Lane == -1)
      Lane = 0;

    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
      return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
                         V1.getOperand(0));
    // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
    // constant. If so, we can just reference the lane's definition directly.
    if (V1.getOpcode() == ISD::BUILD_VECTOR &&
        !isa<ConstantSDNode>(V1.getOperand(Lane)))
      return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));

    // Otherwise, duplicate from the lane of the input vector.
    unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());

    // SelectionDAGBuilder may have "helpfully" already extracted or
    // concatenated to make a vector of the same size as this SHUFFLE. We can
    // ignore the extract entirely, and canonicalise the concat using
    // WidenVector.
    if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
      Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
      V1 = V1.getOperand(0);
    } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
      unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
      Lane -= Idx * VT.getVectorNumElements() / 2;
      V1 = WidenVector(V1.getOperand(Idx), DAG);
    } else if (VT.getSizeInBits() == 64)
      V1 = WidenVector(V1, DAG);

    return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
  }

  if (isREVMask(ShuffleMask, VT, 64))
    return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
  if (isREVMask(ShuffleMask, VT, 32))
    return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
  if (isREVMask(ShuffleMask, VT, 16))
    return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);

  bool ReverseEXT = false;
  unsigned Imm;
  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
    if (ReverseEXT)
      std::swap(V1, V2);
    Imm *= getExtFactor(V1);
    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
                       DAG.getConstant(Imm, dl, MVT::i32));
  } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
    Imm *= getExtFactor(V1);
    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
                       DAG.getConstant(Imm, dl, MVT::i32));
  }

  unsigned WhichResult;
  if (isZIPMask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
  }
  if (isUZPMask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
  }
  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
  }

  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
  }
  if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
  }
  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ?
AArch64ISD::TRN1 : AArch64ISD::TRN2; 6748 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 6749 } 6750 6751 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) 6752 return Concat; 6753 6754 bool DstIsLeft; 6755 int Anomaly; 6756 int NumInputElements = V1.getValueType().getVectorNumElements(); 6757 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { 6758 SDValue DstVec = DstIsLeft ? V1 : V2; 6759 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64); 6760 6761 SDValue SrcVec = V1; 6762 int SrcLane = ShuffleMask[Anomaly]; 6763 if (SrcLane >= NumInputElements) { 6764 SrcVec = V2; 6765 SrcLane -= VT.getVectorNumElements(); 6766 } 6767 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64); 6768 6769 EVT ScalarVT = VT.getVectorElementType(); 6770 6771 if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger()) 6772 ScalarVT = MVT::i32; 6773 6774 return DAG.getNode( 6775 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, 6776 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), 6777 DstLaneV); 6778 } 6779 6780 // If the shuffle is not directly supported and it has 4 elements, use 6781 // the PerfectShuffle-generated table to synthesize it from other shuffles. 6782 unsigned NumElts = VT.getVectorNumElements(); 6783 if (NumElts == 4) { 6784 unsigned PFIndexes[4]; 6785 for (unsigned i = 0; i != 4; ++i) { 6786 if (ShuffleMask[i] < 0) 6787 PFIndexes[i] = 8; 6788 else 6789 PFIndexes[i] = ShuffleMask[i]; 6790 } 6791 6792 // Compute the index in the perfect shuffle table. 6793 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + 6794 PFIndexes[2] * 9 + PFIndexes[3]; 6795 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 6796 unsigned Cost = (PFEntry >> 30); 6797 6798 if (Cost <= 4) 6799 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 6800 } 6801 6802 return GenerateTBL(Op, ShuffleMask, DAG); 6803 } 6804 6805 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, 6806 APInt &UndefBits) { 6807 EVT VT = BVN->getValueType(0); 6808 APInt SplatBits, SplatUndef; 6809 unsigned SplatBitSize; 6810 bool HasAnyUndefs; 6811 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 6812 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; 6813 6814 for (unsigned i = 0; i < NumSplats; ++i) { 6815 CnstBits <<= SplatBitSize; 6816 UndefBits <<= SplatBitSize; 6817 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); 6818 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); 6819 } 6820 6821 return true; 6822 } 6823 6824 return false; 6825 } 6826 6827 // Try 64-bit splatted SIMD immediate. 6828 static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 6829 const APInt &Bits) { 6830 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 6831 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 6832 EVT VT = Op.getValueType(); 6833 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64; 6834 6835 if (AArch64_AM::isAdvSIMDModImmType10(Value)) { 6836 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value); 6837 6838 SDLoc dl(Op); 6839 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 6840 DAG.getConstant(Value, dl, MVT::i32)); 6841 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6842 } 6843 } 6844 6845 return SDValue(); 6846 } 6847 6848 // Try 32-bit splatted SIMD immediate. 
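// The encodable immediates here are an 8-bit value placed in one byte of each
// 32-bit element, i.e. left-shifted by 0, 8, 16 or 24; callers use this for
// the MOVI/MVNI and ORR vector-immediate forms.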
6849 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 6850 const APInt &Bits, 6851 const SDValue *LHS = nullptr) { 6852 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 6853 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 6854 EVT VT = Op.getValueType(); 6855 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; 6856 bool isAdvSIMDModImm = false; 6857 uint64_t Shift; 6858 6859 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) { 6860 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value); 6861 Shift = 0; 6862 } 6863 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) { 6864 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value); 6865 Shift = 8; 6866 } 6867 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) { 6868 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value); 6869 Shift = 16; 6870 } 6871 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) { 6872 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value); 6873 Shift = 24; 6874 } 6875 6876 if (isAdvSIMDModImm) { 6877 SDLoc dl(Op); 6878 SDValue Mov; 6879 6880 if (LHS) 6881 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS, 6882 DAG.getConstant(Value, dl, MVT::i32), 6883 DAG.getConstant(Shift, dl, MVT::i32)); 6884 else 6885 Mov = DAG.getNode(NewOp, dl, MovTy, 6886 DAG.getConstant(Value, dl, MVT::i32), 6887 DAG.getConstant(Shift, dl, MVT::i32)); 6888 6889 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6890 } 6891 } 6892 6893 return SDValue(); 6894 } 6895 6896 // Try 16-bit splatted SIMD immediate. 6897 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 6898 const APInt &Bits, 6899 const SDValue *LHS = nullptr) { 6900 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 6901 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 6902 EVT VT = Op.getValueType(); 6903 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; 6904 bool isAdvSIMDModImm = false; 6905 uint64_t Shift; 6906 6907 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) { 6908 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value); 6909 Shift = 0; 6910 } 6911 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) { 6912 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value); 6913 Shift = 8; 6914 } 6915 6916 if (isAdvSIMDModImm) { 6917 SDLoc dl(Op); 6918 SDValue Mov; 6919 6920 if (LHS) 6921 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS, 6922 DAG.getConstant(Value, dl, MVT::i32), 6923 DAG.getConstant(Shift, dl, MVT::i32)); 6924 else 6925 Mov = DAG.getNode(NewOp, dl, MovTy, 6926 DAG.getConstant(Value, dl, MVT::i32), 6927 DAG.getConstant(Shift, dl, MVT::i32)); 6928 6929 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6930 } 6931 } 6932 6933 return SDValue(); 6934 } 6935 6936 // Try 32-bit splatted SIMD immediate with shifted ones. 6937 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, 6938 SelectionDAG &DAG, const APInt &Bits) { 6939 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 6940 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 6941 EVT VT = Op.getValueType(); 6942 MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; 6943 bool isAdvSIMDModImm = false; 6944 uint64_t Shift; 6945 6946 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) { 6947 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value); 6948 Shift = 264; 6949 } 6950 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) { 6951 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value); 6952 Shift = 272; 6953 } 6954 6955 if (isAdvSIMDModImm) { 6956 SDLoc dl(Op); 6957 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 6958 DAG.getConstant(Value, dl, MVT::i32), 6959 DAG.getConstant(Shift, dl, MVT::i32)); 6960 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6961 } 6962 } 6963 6964 return SDValue(); 6965 } 6966 6967 // Try 8-bit splatted SIMD immediate. 6968 static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 6969 const APInt &Bits) { 6970 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 6971 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 6972 EVT VT = Op.getValueType(); 6973 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; 6974 6975 if (AArch64_AM::isAdvSIMDModImmType9(Value)) { 6976 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value); 6977 6978 SDLoc dl(Op); 6979 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 6980 DAG.getConstant(Value, dl, MVT::i32)); 6981 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 6982 } 6983 } 6984 6985 return SDValue(); 6986 } 6987 6988 // Try FP splatted SIMD immediate. 6989 static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, 6990 const APInt &Bits) { 6991 if (Bits.getHiBits(64) == Bits.getLoBits(64)) { 6992 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); 6993 EVT VT = Op.getValueType(); 6994 bool isWide = (VT.getSizeInBits() == 128); 6995 MVT MovTy; 6996 bool isAdvSIMDModImm = false; 6997 6998 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) { 6999 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value); 7000 MovTy = isWide ? 
MVT::v4f32 : MVT::v2f32; 7001 } 7002 else if (isWide && 7003 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) { 7004 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value); 7005 MovTy = MVT::v2f64; 7006 } 7007 7008 if (isAdvSIMDModImm) { 7009 SDLoc dl(Op); 7010 SDValue Mov = DAG.getNode(NewOp, dl, MovTy, 7011 DAG.getConstant(Value, dl, MVT::i32)); 7012 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); 7013 } 7014 } 7015 7016 return SDValue(); 7017 } 7018 7019 // Specialized code to quickly find if PotentialBVec is a BuildVector that 7020 // consists of only the same constant int value, returned in reference arg 7021 // ConstVal 7022 static bool isAllConstantBuildVector(const SDValue &PotentialBVec, 7023 uint64_t &ConstVal) { 7024 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec); 7025 if (!Bvec) 7026 return false; 7027 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0)); 7028 if (!FirstElt) 7029 return false; 7030 EVT VT = Bvec->getValueType(0); 7031 unsigned NumElts = VT.getVectorNumElements(); 7032 for (unsigned i = 1; i < NumElts; ++i) 7033 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt) 7034 return false; 7035 ConstVal = FirstElt->getZExtValue(); 7036 return true; 7037 } 7038 7039 static unsigned getIntrinsicID(const SDNode *N) { 7040 unsigned Opcode = N->getOpcode(); 7041 switch (Opcode) { 7042 default: 7043 return Intrinsic::not_intrinsic; 7044 case ISD::INTRINSIC_WO_CHAIN: { 7045 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 7046 if (IID < Intrinsic::num_intrinsics) 7047 return IID; 7048 return Intrinsic::not_intrinsic; 7049 } 7050 } 7051 } 7052 7053 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), 7054 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a 7055 // BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. 7056 // Also, logical shift right -> sri, with the same structure. 7057 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { 7058 EVT VT = N->getValueType(0); 7059 7060 if (!VT.isVector()) 7061 return SDValue(); 7062 7063 SDLoc DL(N); 7064 7065 // Is the first op an AND? 7066 const SDValue And = N->getOperand(0); 7067 if (And.getOpcode() != ISD::AND) 7068 return SDValue(); 7069 7070 // Is the second op an shl or lshr? 7071 SDValue Shift = N->getOperand(1); 7072 // This will have been turned into: AArch64ISD::VSHL vector, #shift 7073 // or AArch64ISD::VLSHR vector, #shift 7074 unsigned ShiftOpc = Shift.getOpcode(); 7075 if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR)) 7076 return SDValue(); 7077 bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR; 7078 7079 // Is the shift amount constant? 7080 ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); 7081 if (!C2node) 7082 return SDValue(); 7083 7084 // Is the and mask vector all constant? 7085 uint64_t C1; 7086 if (!isAllConstantBuildVector(And.getOperand(1), C1)) 7087 return SDValue(); 7088 7089 // Is C1 == ~C2, taking into account how much one can shift elements of a 7090 // particular size? 7091 uint64_t C2 = C2node->getZExtValue(); 7092 unsigned ElemSizeInBits = VT.getScalarSizeInBits(); 7093 if (C2 > ElemSizeInBits) 7094 return SDValue(); 7095 unsigned ElemMask = (1 << ElemSizeInBits) - 1; 7096 if ((C1 & ElemMask) != (~C2 & ElemMask)) 7097 return SDValue(); 7098 7099 SDValue X = And.getOperand(0); 7100 SDValue Y = Shift.getOperand(0); 7101 7102 unsigned Intrin = 7103 IsShiftRight ? 
Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; 7104 SDValue ResultSLI = 7105 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 7106 DAG.getConstant(Intrin, DL, MVT::i32), X, Y, 7107 Shift.getOperand(1)); 7108 7109 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n"); 7110 LLVM_DEBUG(N->dump(&DAG)); 7111 LLVM_DEBUG(dbgs() << "into: \n"); 7112 LLVM_DEBUG(ResultSLI->dump(&DAG)); 7113 7114 ++NumShiftInserts; 7115 return ResultSLI; 7116 } 7117 7118 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, 7119 SelectionDAG &DAG) const { 7120 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) 7121 if (EnableAArch64SlrGeneration) { 7122 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) 7123 return Res; 7124 } 7125 7126 EVT VT = Op.getValueType(); 7127 7128 SDValue LHS = Op.getOperand(0); 7129 BuildVectorSDNode *BVN = 7130 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode()); 7131 if (!BVN) { 7132 // OR commutes, so try swapping the operands. 7133 LHS = Op.getOperand(1); 7134 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode()); 7135 } 7136 if (!BVN) 7137 return Op; 7138 7139 APInt DefBits(VT.getSizeInBits(), 0); 7140 APInt UndefBits(VT.getSizeInBits(), 0); 7141 if (resolveBuildVector(BVN, DefBits, UndefBits)) { 7142 SDValue NewOp; 7143 7144 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, 7145 DefBits, &LHS)) || 7146 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, 7147 DefBits, &LHS))) 7148 return NewOp; 7149 7150 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, 7151 UndefBits, &LHS)) || 7152 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, 7153 UndefBits, &LHS))) 7154 return NewOp; 7155 } 7156 7157 // We can always fall back to a non-immediate OR. 7158 return Op; 7159 } 7160 7161 // Normalize the operands of BUILD_VECTOR. The value of constant operands will 7162 // be truncated to fit element width. 7163 static SDValue NormalizeBuildVector(SDValue Op, 7164 SelectionDAG &DAG) { 7165 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 7166 SDLoc dl(Op); 7167 EVT VT = Op.getValueType(); 7168 EVT EltTy= VT.getVectorElementType(); 7169 7170 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) 7171 return Op; 7172 7173 SmallVector<SDValue, 16> Ops; 7174 for (SDValue Lane : Op->ops()) { 7175 // For integer vectors, type legalization would have promoted the 7176 // operands already. Otherwise, if Op is a floating-point splat 7177 // (with operands cast to integers), then the only possibilities 7178 // are constants and UNDEFs. 
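    // Constant lanes are truncated to the element width and re-emitted as the
    // common i32 operand type; undef lanes are kept as i32 undef.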
7179 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) { 7180 APInt LowBits(EltTy.getSizeInBits(), 7181 CstLane->getZExtValue()); 7182 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); 7183 } else if (Lane.getNode()->isUndef()) { 7184 Lane = DAG.getUNDEF(MVT::i32); 7185 } else { 7186 assert(Lane.getValueType() == MVT::i32 && 7187 "Unexpected BUILD_VECTOR operand type"); 7188 } 7189 Ops.push_back(Lane); 7190 } 7191 return DAG.getBuildVector(VT, dl, Ops); 7192 } 7193 7194 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) { 7195 EVT VT = Op.getValueType(); 7196 7197 APInt DefBits(VT.getSizeInBits(), 0); 7198 APInt UndefBits(VT.getSizeInBits(), 0); 7199 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 7200 if (resolveBuildVector(BVN, DefBits, UndefBits)) { 7201 SDValue NewOp; 7202 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || 7203 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || 7204 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || 7205 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || 7206 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || 7207 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) 7208 return NewOp; 7209 7210 DefBits = ~DefBits; 7211 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || 7212 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || 7213 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) 7214 return NewOp; 7215 7216 DefBits = UndefBits; 7217 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || 7218 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || 7219 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || 7220 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || 7221 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || 7222 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) 7223 return NewOp; 7224 7225 DefBits = ~UndefBits; 7226 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || 7227 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || 7228 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) 7229 return NewOp; 7230 } 7231 7232 return SDValue(); 7233 } 7234 7235 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, 7236 SelectionDAG &DAG) const { 7237 EVT VT = Op.getValueType(); 7238 7239 // Try to build a simple constant vector. 7240 Op = NormalizeBuildVector(Op, DAG); 7241 if (VT.isInteger()) { 7242 // Certain vector constants, used to express things like logical NOT and 7243 // arithmetic NEG, are passed through unmodified. This allows special 7244 // patterns for these operations to match, which will lower these constants 7245 // to whatever is proven necessary. 
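    // Concretely, an all-zeros or all-ones integer splat is returned unchanged.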
7246 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 7247 if (BVN->isConstant()) 7248 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) { 7249 unsigned BitSize = VT.getVectorElementType().getSizeInBits(); 7250 APInt Val(BitSize, 7251 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue()); 7252 if (Val.isNullValue() || Val.isAllOnesValue()) 7253 return Op; 7254 } 7255 } 7256 7257 if (SDValue V = ConstantBuildVector(Op, DAG)) 7258 return V; 7259 7260 // Scan through the operands to find some interesting properties we can 7261 // exploit: 7262 // 1) If only one value is used, we can use a DUP, or 7263 // 2) if only the low element is not undef, we can just insert that, or 7264 // 3) if only one constant value is used (w/ some non-constant lanes), 7265 // we can splat the constant value into the whole vector then fill 7266 // in the non-constant lanes. 7267 // 4) FIXME: If different constant values are used, but we can intelligently 7268 // select the values we'll be overwriting for the non-constant 7269 // lanes such that we can directly materialize the vector 7270 // some other way (MOVI, e.g.), we can be sneaky. 7271 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP. 7272 SDLoc dl(Op); 7273 unsigned NumElts = VT.getVectorNumElements(); 7274 bool isOnlyLowElement = true; 7275 bool usesOnlyOneValue = true; 7276 bool usesOnlyOneConstantValue = true; 7277 bool isConstant = true; 7278 bool AllLanesExtractElt = true; 7279 unsigned NumConstantLanes = 0; 7280 SDValue Value; 7281 SDValue ConstantValue; 7282 for (unsigned i = 0; i < NumElts; ++i) { 7283 SDValue V = Op.getOperand(i); 7284 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 7285 AllLanesExtractElt = false; 7286 if (V.isUndef()) 7287 continue; 7288 if (i > 0) 7289 isOnlyLowElement = false; 7290 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 7291 isConstant = false; 7292 7293 if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) { 7294 ++NumConstantLanes; 7295 if (!ConstantValue.getNode()) 7296 ConstantValue = V; 7297 else if (ConstantValue != V) 7298 usesOnlyOneConstantValue = false; 7299 } 7300 7301 if (!Value.getNode()) 7302 Value = V; 7303 else if (V != Value) 7304 usesOnlyOneValue = false; 7305 } 7306 7307 if (!Value.getNode()) { 7308 LLVM_DEBUG( 7309 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n"); 7310 return DAG.getUNDEF(VT); 7311 } 7312 7313 // Convert BUILD_VECTOR where all elements but the lowest are undef into 7314 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector 7315 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR. 7316 if (isOnlyLowElement && !(NumElts == 1 && isa<ConstantSDNode>(Value))) { 7317 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 " 7318 "SCALAR_TO_VECTOR node\n"); 7319 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 7320 } 7321 7322 if (AllLanesExtractElt) { 7323 SDNode *Vector = nullptr; 7324 bool Even = false; 7325 bool Odd = false; 7326 // Check whether the extract elements match the Even pattern <0,2,4,...> or 7327 // the Odd pattern <1,3,5,...>. 7328 for (unsigned i = 0; i < NumElts; ++i) { 7329 SDValue V = Op.getOperand(i); 7330 const SDNode *N = V.getNode(); 7331 if (!isa<ConstantSDNode>(N->getOperand(1))) 7332 break; 7333 SDValue N0 = N->getOperand(0); 7334 7335 // All elements are extracted from the same vector. 
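      // Record the first source vector and require every later extract to use
      // the same one; a mismatch abandons the UZP1/UZP2 match below.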
7336 if (!Vector) { 7337 Vector = N0.getNode(); 7338 // Check that the type of EXTRACT_VECTOR_ELT matches the type of 7339 // BUILD_VECTOR. 7340 if (VT.getVectorElementType() != 7341 N0.getValueType().getVectorElementType()) 7342 break; 7343 } else if (Vector != N0.getNode()) { 7344 Odd = false; 7345 Even = false; 7346 break; 7347 } 7348 7349 // Extracted values are either at Even indices <0,2,4,...> or at Odd 7350 // indices <1,3,5,...>. 7351 uint64_t Val = N->getConstantOperandVal(1); 7352 if (Val == 2 * i) { 7353 Even = true; 7354 continue; 7355 } 7356 if (Val - 1 == 2 * i) { 7357 Odd = true; 7358 continue; 7359 } 7360 7361 // Something does not match: abort. 7362 Odd = false; 7363 Even = false; 7364 break; 7365 } 7366 if (Even || Odd) { 7367 SDValue LHS = 7368 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), 7369 DAG.getConstant(0, dl, MVT::i64)); 7370 SDValue RHS = 7371 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), 7372 DAG.getConstant(NumElts, dl, MVT::i64)); 7373 7374 if (Even && !Odd) 7375 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS, 7376 RHS); 7377 if (Odd && !Even) 7378 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS, 7379 RHS); 7380 } 7381 } 7382 7383 // Use DUP for non-constant splats. For f32 constant splats, reduce to 7384 // i32 and try again. 7385 if (usesOnlyOneValue) { 7386 if (!isConstant) { 7387 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 7388 Value.getValueType() != VT) { 7389 LLVM_DEBUG( 7390 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n"); 7391 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); 7392 } 7393 7394 // This is actually a DUPLANExx operation, which keeps everything vectory. 7395 7396 SDValue Lane = Value.getOperand(1); 7397 Value = Value.getOperand(0); 7398 if (Value.getValueSizeInBits() == 64) { 7399 LLVM_DEBUG( 7400 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, " 7401 "widening it\n"); 7402 Value = WidenVector(Value, DAG); 7403 } 7404 7405 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); 7406 return DAG.getNode(Opcode, dl, VT, Value, Lane); 7407 } 7408 7409 if (VT.getVectorElementType().isFloatingPoint()) { 7410 SmallVector<SDValue, 8> Ops; 7411 EVT EltTy = VT.getVectorElementType(); 7412 assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) && 7413 "Unsupported floating-point vector type"); 7414 LLVM_DEBUG( 7415 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int " 7416 "BITCASTS, and try again\n"); 7417 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); 7418 for (unsigned i = 0; i < NumElts; ++i) 7419 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); 7420 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); 7421 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 7422 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: "; 7423 Val.dump();); 7424 Val = LowerBUILD_VECTOR(Val, DAG); 7425 if (Val.getNode()) 7426 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7427 } 7428 } 7429 7430 // If there was only one constant value used and for more than one lane, 7431 // start by splatting that value, then replace the non-constant lanes. This 7432 // is better than the default, which will perform a separate initialization 7433 // for each lane. 7434 if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { 7435 // Firstly, try to materialize the splat constant. 
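    // If the splat is not encodable as a SIMD modified immediate, fall back to
    // a DUP of the constant below and replace the splat BUILD_VECTOR with it.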
7436 SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue), 7437 Val = ConstantBuildVector(Vec, DAG); 7438 if (!Val) { 7439 // Otherwise, materialize the constant and splat it. 7440 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); 7441 DAG.ReplaceAllUsesWith(Vec.getNode(), &Val); 7442 } 7443 7444 // Now insert the non-constant lanes. 7445 for (unsigned i = 0; i < NumElts; ++i) { 7446 SDValue V = Op.getOperand(i); 7447 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); 7448 if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) 7449 // Note that type legalization likely mucked about with the VT of the 7450 // source operand, so we may have to convert it here before inserting. 7451 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); 7452 } 7453 return Val; 7454 } 7455 7456 // This will generate a load from the constant pool. 7457 if (isConstant) { 7458 LLVM_DEBUG( 7459 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default " 7460 "expansion\n"); 7461 return SDValue(); 7462 } 7463 7464 // Empirical tests suggest this is rarely worth it for vectors of length <= 2. 7465 if (NumElts >= 4) { 7466 if (SDValue shuffle = ReconstructShuffle(Op, DAG)) 7467 return shuffle; 7468 } 7469 7470 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 7471 // know the default expansion would otherwise fall back on something even 7472 // worse. For a vector with one or two non-undef values, that's 7473 // scalar_to_vector for the elements followed by a shuffle (provided the 7474 // shuffle is valid for the target) and materialization element by element 7475 // on the stack followed by a load for everything else. 7476 if (!isConstant && !usesOnlyOneValue) { 7477 LLVM_DEBUG( 7478 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence " 7479 "of INSERT_VECTOR_ELT\n"); 7480 7481 SDValue Vec = DAG.getUNDEF(VT); 7482 SDValue Op0 = Op.getOperand(0); 7483 unsigned i = 0; 7484 7485 // Use SCALAR_TO_VECTOR for lane zero to 7486 // a) Avoid a RMW dependency on the full vector register, and 7487 // b) Allow the register coalescer to fold away the copy if the 7488 // value is already in an S or D register, and we're forced to emit an 7489 // INSERT_SUBREG that we can't fold anywhere. 7490 // 7491 // We also allow types like i8 and i16 which are illegal scalar but legal 7492 // vector element types. After type-legalization the inserted value is 7493 // extended (i32) and it is safe to cast them to the vector type by ignoring 7494 // the upper bits of the lowest lane (e.g. v8i8, v4i16). 7495 if (!Op0.isUndef()) { 7496 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n"); 7497 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0); 7498 ++i; 7499 } 7500 LLVM_DEBUG(if (i < NumElts) dbgs() 7501 << "Creating nodes for the other vector elements:\n";); 7502 for (; i < NumElts; ++i) { 7503 SDValue V = Op.getOperand(i); 7504 if (V.isUndef()) 7505 continue; 7506 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); 7507 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 7508 } 7509 return Vec; 7510 } 7511 7512 LLVM_DEBUG( 7513 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find " 7514 "better alternative\n"); 7515 return SDValue(); 7516 } 7517 7518 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 7519 SelectionDAG &DAG) const { 7520 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); 7521 7522 // Check for non-constant or out of range lane. 
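// For illustration: an INSERT_VECTOR_ELT whose lane index is not a
// compile-time constant is left to the generic expansion by returning an
// empty SDValue here.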
7523 EVT VT = Op.getOperand(0).getValueType(); 7524 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 7525 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) 7526 return SDValue(); 7527 7528 7529 // Insertion/extraction are legal for V128 types. 7530 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || 7531 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || 7532 VT == MVT::v8f16) 7533 return Op; 7534 7535 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && 7536 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) 7537 return SDValue(); 7538 7539 // For V64 types, we perform insertion by expanding the value 7540 // to a V128 type and perform the insertion on that. 7541 SDLoc DL(Op); 7542 SDValue WideVec = WidenVector(Op.getOperand(0), DAG); 7543 EVT WideTy = WideVec.getValueType(); 7544 7545 SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, 7546 Op.getOperand(1), Op.getOperand(2)); 7547 // Re-narrow the resultant vector. 7548 return NarrowVector(Node, DAG); 7549 } 7550 7551 SDValue 7552 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 7553 SelectionDAG &DAG) const { 7554 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); 7555 7556 // Check for non-constant or out of range lane. 7557 EVT VT = Op.getOperand(0).getValueType(); 7558 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 7559 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) 7560 return SDValue(); 7561 7562 7563 // Insertion/extraction are legal for V128 types. 7564 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || 7565 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || 7566 VT == MVT::v8f16) 7567 return Op; 7568 7569 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && 7570 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) 7571 return SDValue(); 7572 7573 // For V64 types, we perform extraction by expanding the value 7574 // to a V128 type and perform the extraction on that. 7575 SDLoc DL(Op); 7576 SDValue WideVec = WidenVector(Op.getOperand(0), DAG); 7577 EVT WideTy = WideVec.getValueType(); 7578 7579 EVT ExtrTy = WideTy.getVectorElementType(); 7580 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) 7581 ExtrTy = MVT::i32; 7582 7583 // For extractions, we just return the result directly. 7584 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, 7585 Op.getOperand(1)); 7586 } 7587 7588 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, 7589 SelectionDAG &DAG) const { 7590 EVT VT = Op.getOperand(0).getValueType(); 7591 SDLoc dl(Op); 7592 // Just in case... 7593 if (!VT.isVector()) 7594 return SDValue(); 7595 7596 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 7597 if (!Cst) 7598 return SDValue(); 7599 unsigned Val = Cst->getZExtValue(); 7600 7601 unsigned Size = Op.getValueSizeInBits(); 7602 7603 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. 7604 if (Val == 0) 7605 return Op; 7606 7607 // If this is extracting the upper 64-bits of a 128-bit vector, we match 7608 // that directly. 
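// For illustration: extracting elements <2, 3> of a v4i32 as a v2i32
// satisfies the check below and is matched directly by the instruction
// patterns.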
7609 if (Size == 64 && Val * VT.getScalarSizeInBits() == 64)
7610 return Op;
7611
7612 return SDValue();
7613 }
7614
7615 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
7616 if (VT.getVectorNumElements() == 4 &&
7617 (VT.is128BitVector() || VT.is64BitVector())) {
7618 unsigned PFIndexes[4];
7619 for (unsigned i = 0; i != 4; ++i) {
7620 if (M[i] < 0)
7621 PFIndexes[i] = 8;
7622 else
7623 PFIndexes[i] = M[i];
7624 }
7625
7626 // Compute the index in the perfect shuffle table.
7627 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
7628 PFIndexes[2] * 9 + PFIndexes[3];
7629 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
7630 unsigned Cost = (PFEntry >> 30);
7631
7632 if (Cost <= 4)
7633 return true;
7634 }
7635
7636 bool DummyBool;
7637 int DummyInt;
7638 unsigned DummyUnsigned;
7639
7640 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
7641 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
7642 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
7643 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
7644 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
7645 isZIPMask(M, VT, DummyUnsigned) ||
7646 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
7647 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
7648 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
7649 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
7650 isConcatMask(M, VT, VT.getSizeInBits() == 128));
7651 }
7652
7653 /// getVShiftImm - Check if this is a valid build_vector for the immediate
7654 /// operand of a vector shift operation, where all the elements of the
7655 /// build_vector must have the same constant integer value.
7656 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
7657 // Ignore bit_converts.
7658 while (Op.getOpcode() == ISD::BITCAST)
7659 Op = Op.getOperand(0);
7660 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
7661 APInt SplatBits, SplatUndef;
7662 unsigned SplatBitSize;
7663 bool HasAnyUndefs;
7664 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
7665 HasAnyUndefs, ElementBits) ||
7666 SplatBitSize > ElementBits)
7667 return false;
7668 Cnt = SplatBits.getSExtValue();
7669 return true;
7670 }
7671
7672 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
7673 /// operand of a vector shift left operation. That value must be in the range:
7674 /// 0 <= Value < ElementBits for a left shift; or
7675 /// 0 <= Value <= ElementBits for a long left shift.
7676 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
7677 assert(VT.isVector() && "vector shift count is not a vector type");
7678 int64_t ElementBits = VT.getScalarSizeInBits();
7679 if (!getVShiftImm(Op, ElementBits, Cnt))
7680 return false;
7681 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
7682 }
7683
7684 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
7685 /// operand of a vector shift right operation. The value must be in the range:
7686 /// 1 <= Value <= ElementBits for a right shift; or
/// 1 <= Value <= ElementBits/2 for a narrowing right shift.
7687 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
7688 assert(VT.isVector() && "vector shift count is not a vector type");
7689 int64_t ElementBits = VT.getScalarSizeInBits();
7690 if (!getVShiftImm(Op, ElementBits, Cnt))
7691 return false;
7692 return (Cnt >= 1 && Cnt <= (isNarrow ?
ElementBits / 2 : ElementBits)); 7693 } 7694 7695 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, 7696 SelectionDAG &DAG) const { 7697 EVT VT = Op.getValueType(); 7698 SDLoc DL(Op); 7699 int64_t Cnt; 7700 7701 if (!Op.getOperand(1).getValueType().isVector()) 7702 return Op; 7703 unsigned EltSize = VT.getScalarSizeInBits(); 7704 7705 switch (Op.getOpcode()) { 7706 default: 7707 llvm_unreachable("unexpected shift opcode"); 7708 7709 case ISD::SHL: 7710 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) 7711 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), 7712 DAG.getConstant(Cnt, DL, MVT::i32)); 7713 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 7714 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, 7715 MVT::i32), 7716 Op.getOperand(0), Op.getOperand(1)); 7717 case ISD::SRA: 7718 case ISD::SRL: 7719 // Right shift immediate 7720 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { 7721 unsigned Opc = 7722 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; 7723 return DAG.getNode(Opc, DL, VT, Op.getOperand(0), 7724 DAG.getConstant(Cnt, DL, MVT::i32)); 7725 } 7726 7727 // Right shift register. Note, there is not a shift right register 7728 // instruction, but the shift left register instruction takes a signed 7729 // value, where negative numbers specify a right shift. 7730 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl 7731 : Intrinsic::aarch64_neon_ushl; 7732 // negate the shift amount 7733 SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1)); 7734 SDValue NegShiftLeft = 7735 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 7736 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0), 7737 NegShift); 7738 return NegShiftLeft; 7739 } 7740 7741 return SDValue(); 7742 } 7743 7744 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, 7745 AArch64CC::CondCode CC, bool NoNans, EVT VT, 7746 const SDLoc &dl, SelectionDAG &DAG) { 7747 EVT SrcVT = LHS.getValueType(); 7748 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && 7749 "function only supposed to emit natural comparisons"); 7750 7751 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode()); 7752 APInt CnstBits(VT.getSizeInBits(), 0); 7753 APInt UndefBits(VT.getSizeInBits(), 0); 7754 bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits); 7755 bool IsZero = IsCnst && (CnstBits == 0); 7756 7757 if (SrcVT.getVectorElementType().isFloatingPoint()) { 7758 switch (CC) { 7759 default: 7760 return SDValue(); 7761 case AArch64CC::NE: { 7762 SDValue Fcmeq; 7763 if (IsZero) 7764 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); 7765 else 7766 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); 7767 return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq); 7768 } 7769 case AArch64CC::EQ: 7770 if (IsZero) 7771 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); 7772 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); 7773 case AArch64CC::GE: 7774 if (IsZero) 7775 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS); 7776 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS); 7777 case AArch64CC::GT: 7778 if (IsZero) 7779 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS); 7780 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); 7781 case AArch64CC::LS: 7782 if (IsZero) 7783 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS); 7784 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS); 7785 case AArch64CC::LT: 7786 if (!NoNans) 7787 return SDValue(); 7788 // If we ignore 
NaNs then we can use the MI implementation.
7789 LLVM_FALLTHROUGH;
7790 case AArch64CC::MI:
7791 if (IsZero)
7792 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
7793 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
7794 }
7795 }
7796
7797 switch (CC) {
7798 default:
7799 return SDValue();
7800 case AArch64CC::NE: {
7801 SDValue Cmeq;
7802 if (IsZero)
7803 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
7804 else
7805 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
7806 return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
7807 }
7808 case AArch64CC::EQ:
7809 if (IsZero)
7810 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
7811 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
7812 case AArch64CC::GE:
7813 if (IsZero)
7814 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
7815 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
7816 case AArch64CC::GT:
7817 if (IsZero)
7818 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
7819 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
7820 case AArch64CC::LE:
7821 if (IsZero)
7822 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
7823 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
7824 case AArch64CC::LS:
7825 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
7826 case AArch64CC::LO:
7827 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
7828 case AArch64CC::LT:
7829 if (IsZero)
7830 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
7831 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
7832 case AArch64CC::HI:
7833 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
7834 case AArch64CC::HS:
7835 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
7836 }
7837 }
7838
7839 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
7840 SelectionDAG &DAG) const {
7841 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7842 SDValue LHS = Op.getOperand(0);
7843 SDValue RHS = Op.getOperand(1);
7844 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
7845 SDLoc dl(Op);
7846
7847 if (LHS.getValueType().getVectorElementType().isInteger()) {
7848 assert(LHS.getValueType() == RHS.getValueType());
7849 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
7850 SDValue Cmp =
7851 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
7852 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
7853 }
7854
7855 const bool FullFP16 =
7856 static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
7857
7858 // Make v4f16 (only) fcmp operations utilise vector instructions
7859 // v8f16 support will be a little more complicated
7860 if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
7861 if (LHS.getValueType().getVectorNumElements() == 4) {
7862 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
7863 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
7864 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
7865 DAG.ReplaceAllUsesWith(Op, NewSetcc);
7866 CmpVT = MVT::v4i32;
7867 } else
7868 return SDValue();
7869 }
7870
7871 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
7872 LHS.getValueType().getVectorElementType() != MVT::f128);
7873
7874 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
7875 // clean. Some of them require two branches to implement.
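// For illustration: a SETONE compare is emitted below as two comparisons
// (e.g. GT and MI) whose results are then ORed together.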
7876 AArch64CC::CondCode CC1, CC2; 7877 bool ShouldInvert; 7878 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); 7879 7880 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; 7881 SDValue Cmp = 7882 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); 7883 if (!Cmp.getNode()) 7884 return SDValue(); 7885 7886 if (CC2 != AArch64CC::AL) { 7887 SDValue Cmp2 = 7888 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); 7889 if (!Cmp2.getNode()) 7890 return SDValue(); 7891 7892 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2); 7893 } 7894 7895 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); 7896 7897 if (ShouldInvert) 7898 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); 7899 7900 return Cmp; 7901 } 7902 7903 static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, 7904 SelectionDAG &DAG) { 7905 SDValue VecOp = ScalarOp.getOperand(0); 7906 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp); 7907 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx, 7908 DAG.getConstant(0, DL, MVT::i64)); 7909 } 7910 7911 SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, 7912 SelectionDAG &DAG) const { 7913 SDLoc dl(Op); 7914 switch (Op.getOpcode()) { 7915 case ISD::VECREDUCE_ADD: 7916 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG); 7917 case ISD::VECREDUCE_SMAX: 7918 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG); 7919 case ISD::VECREDUCE_SMIN: 7920 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG); 7921 case ISD::VECREDUCE_UMAX: 7922 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG); 7923 case ISD::VECREDUCE_UMIN: 7924 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG); 7925 case ISD::VECREDUCE_FMAX: { 7926 assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag"); 7927 return DAG.getNode( 7928 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), 7929 DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32), 7930 Op.getOperand(0)); 7931 } 7932 case ISD::VECREDUCE_FMIN: { 7933 assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag"); 7934 return DAG.getNode( 7935 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), 7936 DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32), 7937 Op.getOperand(0)); 7938 } 7939 default: 7940 llvm_unreachable("Unhandled reduction"); 7941 } 7942 } 7943 7944 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op, 7945 SelectionDAG &DAG) const { 7946 auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget()); 7947 if (!Subtarget.hasLSE()) 7948 return SDValue(); 7949 7950 // LSE has an atomic load-add instruction, but not a load-sub. 7951 SDLoc dl(Op); 7952 MVT VT = Op.getSimpleValueType(); 7953 SDValue RHS = Op.getOperand(2); 7954 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode()); 7955 RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS); 7956 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(), 7957 Op.getOperand(0), Op.getOperand(1), RHS, 7958 AN->getMemOperand()); 7959 } 7960 7961 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, 7962 SelectionDAG &DAG) const { 7963 auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget()); 7964 if (!Subtarget.hasLSE()) 7965 return SDValue(); 7966 7967 // LSE has an atomic load-clear instruction, but not a load-and. 
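// For illustration: atomicrmw and is therefore emitted as LDCLR on the
// complemented operand, since x & v == x & ~(~v).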
7968 SDLoc dl(Op); 7969 MVT VT = Op.getSimpleValueType(); 7970 SDValue RHS = Op.getOperand(2); 7971 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode()); 7972 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS); 7973 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(), 7974 Op.getOperand(0), Op.getOperand(1), RHS, 7975 AN->getMemOperand()); 7976 } 7977 7978 SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( 7979 SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const { 7980 SDLoc dl(Op); 7981 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 7982 SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0); 7983 7984 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); 7985 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask(); 7986 if (Subtarget->hasCustomCallingConv()) 7987 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask); 7988 7989 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size, 7990 DAG.getConstant(4, dl, MVT::i64)); 7991 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue()); 7992 Chain = 7993 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue), 7994 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64), 7995 DAG.getRegisterMask(Mask), Chain.getValue(1)); 7996 // To match the actual intent better, we should read the output from X15 here 7997 // again (instead of potentially spilling it to the stack), but rereading Size 7998 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined 7999 // here. 8000 8001 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size, 8002 DAG.getConstant(4, dl, MVT::i64)); 8003 return Chain; 8004 } 8005 8006 SDValue 8007 AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 8008 SelectionDAG &DAG) const { 8009 assert(Subtarget->isTargetWindows() && 8010 "Only Windows alloca probing supported"); 8011 SDLoc dl(Op); 8012 // Get the inputs. 8013 SDNode *Node = Op.getNode(); 8014 SDValue Chain = Op.getOperand(0); 8015 SDValue Size = Op.getOperand(1); 8016 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 8017 EVT VT = Node->getValueType(0); 8018 8019 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 8020 "no-stack-arg-probe")) { 8021 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); 8022 Chain = SP.getValue(1); 8023 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); 8024 if (Align) 8025 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), 8026 DAG.getConstant(-(uint64_t)Align, dl, VT)); 8027 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); 8028 SDValue Ops[2] = {SP, Chain}; 8029 return DAG.getMergeValues(Ops, dl); 8030 } 8031 8032 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); 8033 8034 Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG); 8035 8036 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); 8037 Chain = SP.getValue(1); 8038 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); 8039 if (Align) 8040 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), 8041 DAG.getConstant(-(uint64_t)Align, dl, VT)); 8042 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); 8043 8044 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), 8045 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); 8046 8047 SDValue Ops[2] = {SP, Chain}; 8048 return DAG.getMergeValues(Ops, dl); 8049 } 8050 8051 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as 8052 /// MemIntrinsicNodes. 
The associated MachineMemOperands record the alignment 8053 /// specified in the intrinsic calls. 8054 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 8055 const CallInst &I, 8056 MachineFunction &MF, 8057 unsigned Intrinsic) const { 8058 auto &DL = I.getModule()->getDataLayout(); 8059 switch (Intrinsic) { 8060 case Intrinsic::aarch64_neon_ld2: 8061 case Intrinsic::aarch64_neon_ld3: 8062 case Intrinsic::aarch64_neon_ld4: 8063 case Intrinsic::aarch64_neon_ld1x2: 8064 case Intrinsic::aarch64_neon_ld1x3: 8065 case Intrinsic::aarch64_neon_ld1x4: 8066 case Intrinsic::aarch64_neon_ld2lane: 8067 case Intrinsic::aarch64_neon_ld3lane: 8068 case Intrinsic::aarch64_neon_ld4lane: 8069 case Intrinsic::aarch64_neon_ld2r: 8070 case Intrinsic::aarch64_neon_ld3r: 8071 case Intrinsic::aarch64_neon_ld4r: { 8072 Info.opc = ISD::INTRINSIC_W_CHAIN; 8073 // Conservatively set memVT to the entire set of vectors loaded. 8074 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 8075 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 8076 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); 8077 Info.offset = 0; 8078 Info.align = 0; 8079 // volatile loads with NEON intrinsics not supported 8080 Info.flags = MachineMemOperand::MOLoad; 8081 return true; 8082 } 8083 case Intrinsic::aarch64_neon_st2: 8084 case Intrinsic::aarch64_neon_st3: 8085 case Intrinsic::aarch64_neon_st4: 8086 case Intrinsic::aarch64_neon_st1x2: 8087 case Intrinsic::aarch64_neon_st1x3: 8088 case Intrinsic::aarch64_neon_st1x4: 8089 case Intrinsic::aarch64_neon_st2lane: 8090 case Intrinsic::aarch64_neon_st3lane: 8091 case Intrinsic::aarch64_neon_st4lane: { 8092 Info.opc = ISD::INTRINSIC_VOID; 8093 // Conservatively set memVT to the entire set of vectors stored. 
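// For illustration: an st3 of three v4i32 values is recorded as a single
// 48-byte (v6i64) access.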
8094 unsigned NumElts = 0; 8095 for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 8096 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 8097 if (!ArgTy->isVectorTy()) 8098 break; 8099 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 8100 } 8101 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 8102 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); 8103 Info.offset = 0; 8104 Info.align = 0; 8105 // volatile stores with NEON intrinsics not supported 8106 Info.flags = MachineMemOperand::MOStore; 8107 return true; 8108 } 8109 case Intrinsic::aarch64_ldaxr: 8110 case Intrinsic::aarch64_ldxr: { 8111 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); 8112 Info.opc = ISD::INTRINSIC_W_CHAIN; 8113 Info.memVT = MVT::getVT(PtrTy->getElementType()); 8114 Info.ptrVal = I.getArgOperand(0); 8115 Info.offset = 0; 8116 Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); 8117 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 8118 return true; 8119 } 8120 case Intrinsic::aarch64_stlxr: 8121 case Intrinsic::aarch64_stxr: { 8122 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); 8123 Info.opc = ISD::INTRINSIC_W_CHAIN; 8124 Info.memVT = MVT::getVT(PtrTy->getElementType()); 8125 Info.ptrVal = I.getArgOperand(1); 8126 Info.offset = 0; 8127 Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); 8128 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 8129 return true; 8130 } 8131 case Intrinsic::aarch64_ldaxp: 8132 case Intrinsic::aarch64_ldxp: 8133 Info.opc = ISD::INTRINSIC_W_CHAIN; 8134 Info.memVT = MVT::i128; 8135 Info.ptrVal = I.getArgOperand(0); 8136 Info.offset = 0; 8137 Info.align = 16; 8138 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 8139 return true; 8140 case Intrinsic::aarch64_stlxp: 8141 case Intrinsic::aarch64_stxp: 8142 Info.opc = ISD::INTRINSIC_W_CHAIN; 8143 Info.memVT = MVT::i128; 8144 Info.ptrVal = I.getArgOperand(2); 8145 Info.offset = 0; 8146 Info.align = 16; 8147 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 8148 return true; 8149 default: 8150 break; 8151 } 8152 8153 return false; 8154 } 8155 8156 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load, 8157 ISD::LoadExtType ExtTy, 8158 EVT NewVT) const { 8159 // TODO: This may be worth removing. Check regression tests for diffs. 8160 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT)) 8161 return false; 8162 8163 // If we're reducing the load width in order to avoid having to use an extra 8164 // instruction to do extension then it's probably a good idea. 8165 if (ExtTy != ISD::NON_EXTLOAD) 8166 return true; 8167 // Don't reduce load width if it would prevent us from combining a shift into 8168 // the offset. 8169 MemSDNode *Mem = dyn_cast<MemSDNode>(Load); 8170 assert(Mem); 8171 const SDValue &Base = Mem->getBasePtr(); 8172 if (Base.getOpcode() == ISD::ADD && 8173 Base.getOperand(1).getOpcode() == ISD::SHL && 8174 Base.getOperand(1).hasOneUse() && 8175 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) { 8176 // The shift can be combined if it matches the size of the value being 8177 // loaded (and so reducing the width would make it not match). 
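// For illustration: an i64 load from base + (idx << 3) folds into
// "ldr x0, [x1, x2, lsl #3]"; narrowing the load would lose that fold.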
8178 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
8179 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
8180 if (ShiftAmount == Log2_32(LoadBytes))
8181 return false;
8182 }
8183 // We have no reason to disallow reducing the load width, so allow it.
8184 return true;
8185 }
8186
8187 // Truncations from 64-bit GPR to 32-bit GPR are free.
8188 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
8189 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
8190 return false;
8191 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
8192 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
8193 return NumBits1 > NumBits2;
8194 }
8195 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
8196 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
8197 return false;
8198 unsigned NumBits1 = VT1.getSizeInBits();
8199 unsigned NumBits2 = VT2.getSizeInBits();
8200 return NumBits1 > NumBits2;
8201 }
8202
8203 /// Check if it is profitable to hoist an instruction in then/else to if.
8204 /// Not profitable if I and its user can form an FMA instruction
8205 /// because we prefer FMSUB/FMADD.
8206 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
8207 if (I->getOpcode() != Instruction::FMul)
8208 return true;
8209
8210 if (!I->hasOneUse())
8211 return true;
8212
8213 Instruction *User = I->user_back();
8214
8215 if (User &&
8216 !(User->getOpcode() == Instruction::FSub ||
8217 User->getOpcode() == Instruction::FAdd))
8218 return true;
8219
8220 const TargetOptions &Options = getTargetMachine().Options;
8221 const DataLayout &DL = I->getModule()->getDataLayout();
8222 EVT VT = getValueType(DL, User->getOperand(0)->getType());
8223
8224 return !(isFMAFasterThanFMulAndFAdd(VT) &&
8225 isOperationLegalOrCustom(ISD::FMA, VT) &&
8226 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
8227 Options.UnsafeFPMath));
8228 }
8229
8230 // All 32-bit GPR operations implicitly zero the high-half of the corresponding
8231 // 64-bit GPR.
8232 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
8233 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
8234 return false;
8235 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
8236 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
8237 return NumBits1 == 32 && NumBits2 == 64;
8238 }
8239 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
8240 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
8241 return false;
8242 unsigned NumBits1 = VT1.getSizeInBits();
8243 unsigned NumBits2 = VT2.getSizeInBits();
8244 return NumBits1 == 32 && NumBits2 == 64;
8245 }
8246
8247 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
8248 EVT VT1 = Val.getValueType();
8249 if (isZExtFree(VT1, VT2)) {
8250 return true;
8251 }
8252
8253 if (Val.getOpcode() != ISD::LOAD)
8254 return false;
8255
8256 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
8257 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
8258 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
8259 VT1.getSizeInBits() <= 32);
8260 }
8261
8262 bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
8263 if (isa<FPExtInst>(Ext))
8264 return false;
8265
8266 // Vector types are not free.
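// For illustration: a zext from <4 x i8> to <4 x i32> needs real extend
// instructions, unlike the scalar extensions considered below.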
8267 if (Ext->getType()->isVectorTy()) 8268 return false; 8269 8270 for (const Use &U : Ext->uses()) { 8271 // The extension is free if we can fold it with a left shift in an 8272 // addressing mode or an arithmetic operation: add, sub, and cmp. 8273 8274 // Is there a shift? 8275 const Instruction *Instr = cast<Instruction>(U.getUser()); 8276 8277 // Is this a constant shift? 8278 switch (Instr->getOpcode()) { 8279 case Instruction::Shl: 8280 if (!isa<ConstantInt>(Instr->getOperand(1))) 8281 return false; 8282 break; 8283 case Instruction::GetElementPtr: { 8284 gep_type_iterator GTI = gep_type_begin(Instr); 8285 auto &DL = Ext->getModule()->getDataLayout(); 8286 std::advance(GTI, U.getOperandNo()-1); 8287 Type *IdxTy = GTI.getIndexedType(); 8288 // This extension will end up with a shift because of the scaling factor. 8289 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0. 8290 // Get the shift amount based on the scaling factor: 8291 // log2(sizeof(IdxTy)) - log2(8). 8292 uint64_t ShiftAmt = 8293 countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3; 8294 // Is the constant foldable in the shift of the addressing mode? 8295 // I.e., shift amount is between 1 and 4 inclusive. 8296 if (ShiftAmt == 0 || ShiftAmt > 4) 8297 return false; 8298 break; 8299 } 8300 case Instruction::Trunc: 8301 // Check if this is a noop. 8302 // trunc(sext ty1 to ty2) to ty1. 8303 if (Instr->getType() == Ext->getOperand(0)->getType()) 8304 continue; 8305 LLVM_FALLTHROUGH; 8306 default: 8307 return false; 8308 } 8309 8310 // At this point we can use the bfm family, so this extension is free 8311 // for that use. 8312 } 8313 return true; 8314 } 8315 8316 /// Check if both Op1 and Op2 are shufflevector extracts of either the lower 8317 /// or upper half of the vector elements. 8318 static bool areExtractShuffleVectors(Value *Op1, Value *Op2) { 8319 auto areTypesHalfed = [](Value *FullV, Value *HalfV) { 8320 auto *FullVT = cast<VectorType>(FullV->getType()); 8321 auto *HalfVT = cast<VectorType>(HalfV->getType()); 8322 return FullVT->getBitWidth() == 2 * HalfVT->getBitWidth(); 8323 }; 8324 8325 auto extractHalf = [](Value *FullV, Value *HalfV) { 8326 auto *FullVT = cast<VectorType>(FullV->getType()); 8327 auto *HalfVT = cast<VectorType>(HalfV->getType()); 8328 return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); 8329 }; 8330 8331 Constant *M1, *M2; 8332 Value *S1Op1, *S2Op1; 8333 if (!match(Op1, m_ShuffleVector(m_Value(S1Op1), m_Undef(), m_Constant(M1))) || 8334 !match(Op2, m_ShuffleVector(m_Value(S2Op1), m_Undef(), m_Constant(M2)))) 8335 return false; 8336 8337 // Check that the operands are half as wide as the result and we extract 8338 // half of the elements of the input vectors. 8339 if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) || 8340 !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2)) 8341 return false; 8342 8343 // Check the mask extracts either the lower or upper half of vector 8344 // elements. 8345 int M1Start = -1; 8346 int M2Start = -1; 8347 int NumElements = cast<VectorType>(Op1->getType())->getNumElements() * 2; 8348 if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) || 8349 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) || 8350 M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2))) 8351 return false; 8352 8353 return true; 8354 } 8355 8356 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth 8357 /// of the vector elements. 
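/// For illustration: two sext instructions from <8 x i8> to <8 x i16>
/// qualify, while an extend to anything other than double the width does not.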
8358 static bool areExtractExts(Value *Ext1, Value *Ext2) { 8359 auto areExtDoubled = [](Instruction *Ext) { 8360 return Ext->getType()->getScalarSizeInBits() == 8361 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); 8362 }; 8363 8364 if (!match(Ext1, m_ZExtOrSExt(m_Value())) || 8365 !match(Ext2, m_ZExtOrSExt(m_Value())) || 8366 !areExtDoubled(cast<Instruction>(Ext1)) || 8367 !areExtDoubled(cast<Instruction>(Ext2))) 8368 return false; 8369 8370 return true; 8371 } 8372 8373 /// Check if sinking \p I's operands to I's basic block is profitable, because 8374 /// the operands can be folded into a target instruction, e.g. 8375 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). 8376 bool AArch64TargetLowering::shouldSinkOperands( 8377 Instruction *I, SmallVectorImpl<Use *> &Ops) const { 8378 if (!I->getType()->isVectorTy()) 8379 return false; 8380 8381 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { 8382 switch (II->getIntrinsicID()) { 8383 case Intrinsic::aarch64_neon_umull: 8384 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) 8385 return false; 8386 Ops.push_back(&II->getOperandUse(0)); 8387 Ops.push_back(&II->getOperandUse(1)); 8388 return true; 8389 default: 8390 return false; 8391 } 8392 } 8393 8394 switch (I->getOpcode()) { 8395 case Instruction::Sub: 8396 case Instruction::Add: { 8397 if (!areExtractExts(I->getOperand(0), I->getOperand(1))) 8398 return false; 8399 8400 // If the exts' operands extract either the lower or upper elements, we 8401 // can sink them too. 8402 auto Ext1 = cast<Instruction>(I->getOperand(0)); 8403 auto Ext2 = cast<Instruction>(I->getOperand(1)); 8404 if (areExtractShuffleVectors(Ext1, Ext2)) { 8405 Ops.push_back(&Ext1->getOperandUse(0)); 8406 Ops.push_back(&Ext2->getOperandUse(0)); 8407 } 8408 8409 Ops.push_back(&I->getOperandUse(0)); 8410 Ops.push_back(&I->getOperandUse(1)); 8411 8412 return true; 8413 } 8414 default: 8415 return false; 8416 } 8417 return false; 8418 } 8419 8420 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, 8421 unsigned &RequiredAligment) const { 8422 if (!LoadedType.isSimple() || 8423 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) 8424 return false; 8425 // Cyclone supports unaligned accesses. 8426 RequiredAligment = 0; 8427 unsigned NumBits = LoadedType.getSizeInBits(); 8428 return NumBits == 32 || NumBits == 64; 8429 } 8430 8431 /// A helper function for determining the number of interleaved accesses we 8432 /// will generate when lowering accesses of the given type. 8433 unsigned 8434 AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, 8435 const DataLayout &DL) const { 8436 return (DL.getTypeSizeInBits(VecTy) + 127) / 128; 8437 } 8438 8439 MachineMemOperand::Flags 8440 AArch64TargetLowering::getMMOFlags(const Instruction &I) const { 8441 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && 8442 I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) 8443 return MOStridedAccess; 8444 return MachineMemOperand::MONone; 8445 } 8446 8447 bool AArch64TargetLowering::isLegalInterleavedAccessType( 8448 VectorType *VecTy, const DataLayout &DL) const { 8449 8450 unsigned VecSize = DL.getTypeSizeInBits(VecTy); 8451 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); 8452 8453 // Ensure the number of vector elements is greater than 1. 8454 if (VecTy->getNumElements() < 2) 8455 return false; 8456 8457 // Ensure the element type is legal. 
8458 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) 8459 return false; 8460 8461 // Ensure the total vector size is 64 or a multiple of 128. Types larger than 8462 // 128 will be split into multiple interleaved accesses. 8463 return VecSize == 64 || VecSize % 128 == 0; 8464 } 8465 8466 /// Lower an interleaved load into a ldN intrinsic. 8467 /// 8468 /// E.g. Lower an interleaved load (Factor = 2): 8469 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr 8470 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements 8471 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements 8472 /// 8473 /// Into: 8474 /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr) 8475 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 8476 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 8477 bool AArch64TargetLowering::lowerInterleavedLoad( 8478 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, 8479 ArrayRef<unsigned> Indices, unsigned Factor) const { 8480 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 8481 "Invalid interleave factor"); 8482 assert(!Shuffles.empty() && "Empty shufflevector input"); 8483 assert(Shuffles.size() == Indices.size() && 8484 "Unmatched number of shufflevectors and indices"); 8485 8486 const DataLayout &DL = LI->getModule()->getDataLayout(); 8487 8488 VectorType *VecTy = Shuffles[0]->getType(); 8489 8490 // Skip if we do not have NEON and skip illegal vector types. We can 8491 // "legalize" wide vector types into multiple interleaved accesses as long as 8492 // the vector types are divisible by 128. 8493 if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL)) 8494 return false; 8495 8496 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); 8497 8498 // A pointer vector can not be the return type of the ldN intrinsics. Need to 8499 // load integer vectors first and then convert to pointer vectors. 8500 Type *EltTy = VecTy->getVectorElementType(); 8501 if (EltTy->isPointerTy()) 8502 VecTy = 8503 VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); 8504 8505 IRBuilder<> Builder(LI); 8506 8507 // The base address of the load. 8508 Value *BaseAddr = LI->getPointerOperand(); 8509 8510 if (NumLoads > 1) { 8511 // If we're going to generate more than one load, reset the sub-vector type 8512 // to something legal. 8513 VecTy = VectorType::get(VecTy->getVectorElementType(), 8514 VecTy->getVectorNumElements() / NumLoads); 8515 8516 // We will compute the pointer operand of each load from the original base 8517 // address using GEPs. Cast the base address to a pointer to the scalar 8518 // element type. 8519 BaseAddr = Builder.CreateBitCast( 8520 BaseAddr, VecTy->getVectorElementType()->getPointerTo( 8521 LI->getPointerAddressSpace())); 8522 } 8523 8524 Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); 8525 Type *Tys[2] = {VecTy, PtrTy}; 8526 static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2, 8527 Intrinsic::aarch64_neon_ld3, 8528 Intrinsic::aarch64_neon_ld4}; 8529 Function *LdNFunc = 8530 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); 8531 8532 // Holds sub-vectors extracted from the load intrinsic return values. The 8533 // sub-vectors are associated with the shufflevector instructions they will 8534 // replace. 
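// For illustration: in the Factor = 2 example above, %v0 maps to result 0 of
// each ld2 call and %v1 maps to result 1.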
8535 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs; 8536 8537 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { 8538 8539 // If we're generating more than one load, compute the base address of 8540 // subsequent loads as an offset from the previous. 8541 if (LoadCount > 0) 8542 BaseAddr = 8543 Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr, 8544 VecTy->getVectorNumElements() * Factor); 8545 8546 CallInst *LdN = Builder.CreateCall( 8547 LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN"); 8548 8549 // Extract and store the sub-vectors returned by the load intrinsic. 8550 for (unsigned i = 0; i < Shuffles.size(); i++) { 8551 ShuffleVectorInst *SVI = Shuffles[i]; 8552 unsigned Index = Indices[i]; 8553 8554 Value *SubVec = Builder.CreateExtractValue(LdN, Index); 8555 8556 // Convert the integer vector to pointer vector if the element is pointer. 8557 if (EltTy->isPointerTy()) 8558 SubVec = Builder.CreateIntToPtr( 8559 SubVec, VectorType::get(SVI->getType()->getVectorElementType(), 8560 VecTy->getVectorNumElements())); 8561 SubVecs[SVI].push_back(SubVec); 8562 } 8563 } 8564 8565 // Replace uses of the shufflevector instructions with the sub-vectors 8566 // returned by the load intrinsic. If a shufflevector instruction is 8567 // associated with more than one sub-vector, those sub-vectors will be 8568 // concatenated into a single wide vector. 8569 for (ShuffleVectorInst *SVI : Shuffles) { 8570 auto &SubVec = SubVecs[SVI]; 8571 auto *WideVec = 8572 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; 8573 SVI->replaceAllUsesWith(WideVec); 8574 } 8575 8576 return true; 8577 } 8578 8579 /// Lower an interleaved store into a stN intrinsic. 8580 /// 8581 /// E.g. Lower an interleaved store (Factor = 3): 8582 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, 8583 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> 8584 /// store <12 x i32> %i.vec, <12 x i32>* %ptr 8585 /// 8586 /// Into: 8587 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> 8588 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> 8589 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> 8590 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) 8591 /// 8592 /// Note that the new shufflevectors will be removed and we'll only generate one 8593 /// st3 instruction in CodeGen. 8594 /// 8595 /// Example for a more general valid mask (Factor 3). 
Lower: 8596 /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, 8597 /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> 8598 /// store <12 x i32> %i.vec, <12 x i32>* %ptr 8599 /// 8600 /// Into: 8601 /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7> 8602 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> 8603 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> 8604 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) 8605 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, 8606 ShuffleVectorInst *SVI, 8607 unsigned Factor) const { 8608 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 8609 "Invalid interleave factor"); 8610 8611 VectorType *VecTy = SVI->getType(); 8612 assert(VecTy->getVectorNumElements() % Factor == 0 && 8613 "Invalid interleaved store"); 8614 8615 unsigned LaneLen = VecTy->getVectorNumElements() / Factor; 8616 Type *EltTy = VecTy->getVectorElementType(); 8617 VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); 8618 8619 const DataLayout &DL = SI->getModule()->getDataLayout(); 8620 8621 // Skip if we do not have NEON and skip illegal vector types. We can 8622 // "legalize" wide vector types into multiple interleaved accesses as long as 8623 // the vector types are divisible by 128. 8624 if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL)) 8625 return false; 8626 8627 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); 8628 8629 Value *Op0 = SVI->getOperand(0); 8630 Value *Op1 = SVI->getOperand(1); 8631 IRBuilder<> Builder(SI); 8632 8633 // StN intrinsics don't support pointer vectors as arguments. Convert pointer 8634 // vectors to integer vectors. 8635 if (EltTy->isPointerTy()) { 8636 Type *IntTy = DL.getIntPtrType(EltTy); 8637 unsigned NumOpElts = Op0->getType()->getVectorNumElements(); 8638 8639 // Convert to the corresponding integer vector. 8640 Type *IntVecTy = VectorType::get(IntTy, NumOpElts); 8641 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); 8642 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); 8643 8644 SubVecTy = VectorType::get(IntTy, LaneLen); 8645 } 8646 8647 // The base address of the store. 8648 Value *BaseAddr = SI->getPointerOperand(); 8649 8650 if (NumStores > 1) { 8651 // If we're going to generate more than one store, reset the lane length 8652 // and sub-vector type to something legal. 8653 LaneLen /= NumStores; 8654 SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); 8655 8656 // We will compute the pointer operand of each store from the original base 8657 // address using GEPs. Cast the base address to a pointer to the scalar 8658 // element type. 8659 BaseAddr = Builder.CreateBitCast( 8660 BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( 8661 SI->getPointerAddressSpace())); 8662 } 8663 8664 auto Mask = SVI->getShuffleMask(); 8665 8666 Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace()); 8667 Type *Tys[2] = {SubVecTy, PtrTy}; 8668 static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2, 8669 Intrinsic::aarch64_neon_st3, 8670 Intrinsic::aarch64_neon_st4}; 8671 Function *StNFunc = 8672 Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); 8673 8674 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { 8675 8676 SmallVector<Value *, 5> Ops; 8677 8678 // Split the shufflevector operands into sub vectors for the new stN call. 
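// For illustration: with the Factor = 3 example above, each iteration of the
// loop below builds one of %sub.v0, %sub.v1 and %sub.v2.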
8679 for (unsigned i = 0; i < Factor; i++) {
8680 unsigned IdxI = StoreCount * LaneLen * Factor + i;
8681 if (Mask[IdxI] >= 0) {
8682 Ops.push_back(Builder.CreateShuffleVector(
8683 Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
8684 } else {
8685 unsigned StartMask = 0;
8686 for (unsigned j = 1; j < LaneLen; j++) {
8687 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
8688 if (Mask[IdxJ * Factor + IdxI] >= 0) {
8689 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
8690 break;
8691 }
8692 }
8693 // Note: Filling undef gaps with random elements is ok, since
8694 // those elements were being written anyway (with undefs).
8695 // In the case of all undefs we're defaulting to using elems from 0
8696 // Note: StartMask cannot be negative, it's checked in
8697 // isReInterleaveMask
8698 Ops.push_back(Builder.CreateShuffleVector(
8699 Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
8700 }
8701 }
8702
8703 // If we're generating more than one store, we compute the base address of
8704 // subsequent stores as an offset from the previous.
8705 if (StoreCount > 0)
8706 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
8707 BaseAddr, LaneLen * Factor);
8708
8709 Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
8710 Builder.CreateCall(StNFunc, Ops);
8711 }
8712 return true;
8713 }
8714
8715 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
8716 unsigned AlignCheck) {
8717 return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
8718 (DstAlign == 0 || DstAlign % AlignCheck == 0));
8719 }
8720
8721 EVT AArch64TargetLowering::getOptimalMemOpType(
8722 uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
8723 bool ZeroMemset, bool MemcpyStrSrc,
8724 const AttributeList &FuncAttributes) const {
8725 bool CanImplicitFloat =
8726 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
8727 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
8728 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
8729 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
8730 // taken one instruction to materialize the v2i64 zero and one store (with
8731 // restrictive addressing mode). Just do i64 stores.
8732 bool IsSmallMemset = IsMemset && Size < 32;
8733 auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) {
8734 if (memOpAlign(SrcAlign, DstAlign, AlignCheck))
8735 return true;
8736 bool Fast;
8737 return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
8738 &Fast) &&
8739 Fast;
8740 };
8741
8742 if (CanUseNEON && IsMemset && !IsSmallMemset &&
8743 AlignmentIsAcceptable(MVT::v2i64, 16))
8744 return MVT::v2i64;
8745 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16))
8746 return MVT::f128;
8747 if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8))
8748 return MVT::i64;
8749 if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4))
8750 return MVT::i32;
8751 return MVT::Other;
8752 }
8753
8754 // 12-bit optionally shifted immediates are legal for adds.
8755 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
8756 if (Immed == std::numeric_limits<int64_t>::min()) {
8757 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
8758 << ": avoid UB for INT64_MIN\n");
8759 return false;
8760 }
8761 // Same encoding for add/sub, just flip the sign.
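// For illustration: 0x123 and 0x123000 (0x123 << 12) are legal add
// immediates, while 0x123456 is not.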
8762 Immed = std::abs(Immed); 8763 bool IsLegal = ((Immed >> 12) == 0 || 8764 ((Immed & 0xfff) == 0 && Immed >> 24 == 0)); 8765 LLVM_DEBUG(dbgs() << "Is " << Immed 8766 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n"); 8767 return IsLegal; 8768 } 8769 8770 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid 8771 // immediates is the same as for an add or a sub. 8772 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { 8773 return isLegalAddImmediate(Immed); 8774 } 8775 8776 /// isLegalAddressingMode - Return true if the addressing mode represented 8777 /// by AM is legal for this target, for a load/store of the specified type. 8778 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, 8779 const AddrMode &AM, Type *Ty, 8780 unsigned AS, Instruction *I) const { 8781 // AArch64 has five basic addressing modes: 8782 // reg 8783 // reg + 9-bit signed offset 8784 // reg + SIZE_IN_BYTES * 12-bit unsigned offset 8785 // reg1 + reg2 8786 // reg + SIZE_IN_BYTES * reg 8787 8788 // No global is ever allowed as a base. 8789 if (AM.BaseGV) 8790 return false; 8791 8792 // No reg+reg+imm addressing. 8793 if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) 8794 return false; 8795 8796 // check reg + imm case: 8797 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 8798 uint64_t NumBytes = 0; 8799 if (Ty->isSized()) { 8800 uint64_t NumBits = DL.getTypeSizeInBits(Ty); 8801 NumBytes = NumBits / 8; 8802 if (!isPowerOf2_64(NumBits)) 8803 NumBytes = 0; 8804 } 8805 8806 if (!AM.Scale) { 8807 int64_t Offset = AM.BaseOffs; 8808 8809 // 9-bit signed offset 8810 if (isInt<9>(Offset)) 8811 return true; 8812 8813 // 12-bit unsigned offset 8814 unsigned shift = Log2_64(NumBytes); 8815 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && 8816 // Must be a multiple of NumBytes (NumBytes is a power of 2) 8817 (Offset >> shift) << shift == Offset) 8818 return true; 8819 return false; 8820 } 8821 8822 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 8823 8824 return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes); 8825 } 8826 8827 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const { 8828 // Consider splitting large offset of struct or array. 8829 return true; 8830 } 8831 8832 int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, 8833 const AddrMode &AM, Type *Ty, 8834 unsigned AS) const { 8835 // Scaling factors are not free at all. 8836 // Operands | Rt Latency 8837 // ------------------------------------------- 8838 // Rt, [Xn, Xm] | 4 8839 // ------------------------------------------- 8840 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 8841 // Rt, [Xn, Wm, <extend> #imm] | 8842 if (isLegalAddressingMode(DL, AM, Ty, AS)) 8843 // Scale represents reg2 * scale, thus account for 1 if 8844 // it is not equal to 0 or 1. 8845 return AM.Scale != 0 && AM.Scale != 1; 8846 return -1; 8847 } 8848 8849 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 8850 VT = VT.getScalarType(); 8851 8852 if (!VT.isSimple()) 8853 return false; 8854 8855 switch (VT.getSimpleVT().SimpleTy) { 8856 case MVT::f32: 8857 case MVT::f64: 8858 return true; 8859 default: 8860 break; 8861 } 8862 8863 return false; 8864 } 8865 8866 const MCPhysReg * 8867 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { 8868 // LR is a callee-save register, but we must treat it as clobbered by any call 8869 // site. 
Hence we include LR in the scratch registers, which are in turn added 8870 // as implicit-defs for stackmaps and patchpoints. 8871 static const MCPhysReg ScratchRegs[] = { 8872 AArch64::X16, AArch64::X17, AArch64::LR, 0 8873 }; 8874 return ScratchRegs; 8875 } 8876 8877 bool 8878 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N, 8879 CombineLevel Level) const { 8880 N = N->getOperand(0).getNode(); 8881 EVT VT = N->getValueType(0); 8882 // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine 8883 // it with shift to let it be lowered to UBFX. 8884 if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && 8885 isa<ConstantSDNode>(N->getOperand(1))) { 8886 uint64_t TruncMask = N->getConstantOperandVal(1); 8887 if (isMask_64(TruncMask) && 8888 N->getOperand(0).getOpcode() == ISD::SRL && 8889 isa<ConstantSDNode>(N->getOperand(0)->getOperand(1))) 8890 return false; 8891 } 8892 return true; 8893 } 8894 8895 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 8896 Type *Ty) const { 8897 assert(Ty->isIntegerTy()); 8898 8899 unsigned BitSize = Ty->getPrimitiveSizeInBits(); 8900 if (BitSize == 0) 8901 return false; 8902 8903 int64_t Val = Imm.getSExtValue(); 8904 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize)) 8905 return true; 8906 8907 if ((int64_t)Val < 0) 8908 Val = ~Val; 8909 if (BitSize == 32) 8910 Val &= (1LL << 32) - 1; 8911 8912 unsigned LZ = countLeadingZeros((uint64_t)Val); 8913 unsigned Shift = (63 - LZ) / 16; 8914 // MOVZ is free so return true for one or fewer MOVK. 8915 return Shift < 3; 8916 } 8917 8918 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, 8919 unsigned Index) const { 8920 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) 8921 return false; 8922 8923 return (Index == 0 || Index == ResVT.getVectorNumElements()); 8924 } 8925 8926 /// Turn vector tests of the signbit in the form of: 8927 /// xor (sra X, elt_size(X)-1), -1 8928 /// into: 8929 /// cmge X, X, #0 8930 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, 8931 const AArch64Subtarget *Subtarget) { 8932 EVT VT = N->getValueType(0); 8933 if (!Subtarget->hasNEON() || !VT.isVector()) 8934 return SDValue(); 8935 8936 // There must be a shift right algebraic before the xor, and the xor must be a 8937 // 'not' operation. 8938 SDValue Shift = N->getOperand(0); 8939 SDValue Ones = N->getOperand(1); 8940 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() || 8941 !ISD::isBuildVectorAllOnes(Ones.getNode())) 8942 return SDValue(); 8943 8944 // The shift should be smearing the sign bit across each vector element. 8945 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); 8946 EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); 8947 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) 8948 return SDValue(); 8949 8950 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0)); 8951 } 8952 8953 // Generate SUBS and CSEL for integer abs. 8954 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { 8955 EVT VT = N->getValueType(0); 8956 8957 SDValue N0 = N->getOperand(0); 8958 SDValue N1 = N->getOperand(1); 8959 SDLoc DL(N); 8960 8961 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) 8962 // and change it to SUB and CSEL. 
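// For illustration: this is the branchless abs idiom, e.g. for i32
// abs(x) == (x + (x >> 31)) ^ (x >> 31) with an arithmetic shift.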
8963 if (VT.isInteger() && N->getOpcode() == ISD::XOR && 8964 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && 8965 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) 8966 if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) 8967 if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { 8968 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), 8969 N0.getOperand(0)); 8970 // Generate SUBS & CSEL. 8971 SDValue Cmp = 8972 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), 8973 N0.getOperand(0), DAG.getConstant(0, DL, VT)); 8974 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg, 8975 DAG.getConstant(AArch64CC::PL, DL, MVT::i32), 8976 SDValue(Cmp.getNode(), 1)); 8977 } 8978 return SDValue(); 8979 } 8980 8981 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, 8982 TargetLowering::DAGCombinerInfo &DCI, 8983 const AArch64Subtarget *Subtarget) { 8984 if (DCI.isBeforeLegalizeOps()) 8985 return SDValue(); 8986 8987 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) 8988 return Cmp; 8989 8990 return performIntegerAbsCombine(N, DAG); 8991 } 8992 8993 SDValue 8994 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 8995 SelectionDAG &DAG, 8996 SmallVectorImpl<SDNode *> &Created) const { 8997 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); 8998 if (isIntDivCheap(N->getValueType(0), Attr)) 8999 return SDValue(N,0); // Lower SDIV as SDIV 9000 9001 // fold (sdiv X, pow2) 9002 EVT VT = N->getValueType(0); 9003 if ((VT != MVT::i32 && VT != MVT::i64) || 9004 !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) 9005 return SDValue(); 9006 9007 SDLoc DL(N); 9008 SDValue N0 = N->getOperand(0); 9009 unsigned Lg2 = Divisor.countTrailingZeros(); 9010 SDValue Zero = DAG.getConstant(0, DL, VT); 9011 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); 9012 9013 // Add (N0 < 0) ? Pow2 - 1 : 0; 9014 SDValue CCVal; 9015 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); 9016 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); 9017 SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); 9018 9019 Created.push_back(Cmp.getNode()); 9020 Created.push_back(Add.getNode()); 9021 Created.push_back(CSel.getNode()); 9022 9023 // Divide by pow2. 9024 SDValue SRA = 9025 DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64)); 9026 9027 // If we're dividing by a positive value, we're done. Otherwise, we must 9028 // negate the result. 9029 if (Divisor.isNonNegative()) 9030 return SRA; 9031 9032 Created.push_back(SRA.getNode()); 9033 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); 9034 } 9035 9036 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, 9037 TargetLowering::DAGCombinerInfo &DCI, 9038 const AArch64Subtarget *Subtarget) { 9039 if (DCI.isBeforeLegalizeOps()) 9040 return SDValue(); 9041 9042 // The below optimizations require a constant RHS. 9043 if (!isa<ConstantSDNode>(N->getOperand(1))) 9044 return SDValue(); 9045 9046 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1)); 9047 const APInt &ConstValue = C->getAPIntValue(); 9048 9049 // Multiplication of a power of two plus/minus one can be done more 9050 // cheaply as as shift+add/sub. For now, this is true unilaterally. If 9051 // future CPUs have a cheaper MADD instruction, this may need to be 9052 // gated on a subtarget feature. 
For Cyclone, 32-bit MADD is 4 cycles and 9053 // 64-bit is 5 cycles, so this is always a win. 9054 // More aggressively, some multiplications N0 * C can be lowered to 9055 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M, 9056 // e.g. 6=3*2=(2+1)*2. 9057 // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45 9058 // which equals to (1+2)*16-(1+2). 9059 SDValue N0 = N->getOperand(0); 9060 // TrailingZeroes is used to test if the mul can be lowered to 9061 // shift+add+shift. 9062 unsigned TrailingZeroes = ConstValue.countTrailingZeros(); 9063 if (TrailingZeroes) { 9064 // Conservatively do not lower to shift+add+shift if the mul might be 9065 // folded into smul or umul. 9066 if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) || 9067 isZeroExtended(N0.getNode(), DAG))) 9068 return SDValue(); 9069 // Conservatively do not lower to shift+add+shift if the mul might be 9070 // folded into madd or msub. 9071 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD || 9072 N->use_begin()->getOpcode() == ISD::SUB)) 9073 return SDValue(); 9074 } 9075 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub 9076 // and shift+add+shift. 9077 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes); 9078 9079 unsigned ShiftAmt, AddSubOpc; 9080 // Is the shifted value the LHS operand of the add/sub? 9081 bool ShiftValUseIsN0 = true; 9082 // Do we need to negate the result? 9083 bool NegateResult = false; 9084 9085 if (ConstValue.isNonNegative()) { 9086 // (mul x, 2^N + 1) => (add (shl x, N), x) 9087 // (mul x, 2^N - 1) => (sub (shl x, N), x) 9088 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M) 9089 APInt SCVMinus1 = ShiftedConstValue - 1; 9090 APInt CVPlus1 = ConstValue + 1; 9091 if (SCVMinus1.isPowerOf2()) { 9092 ShiftAmt = SCVMinus1.logBase2(); 9093 AddSubOpc = ISD::ADD; 9094 } else if (CVPlus1.isPowerOf2()) { 9095 ShiftAmt = CVPlus1.logBase2(); 9096 AddSubOpc = ISD::SUB; 9097 } else 9098 return SDValue(); 9099 } else { 9100 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 9101 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 9102 APInt CVNegPlus1 = -ConstValue + 1; 9103 APInt CVNegMinus1 = -ConstValue - 1; 9104 if (CVNegPlus1.isPowerOf2()) { 9105 ShiftAmt = CVNegPlus1.logBase2(); 9106 AddSubOpc = ISD::SUB; 9107 ShiftValUseIsN0 = false; 9108 } else if (CVNegMinus1.isPowerOf2()) { 9109 ShiftAmt = CVNegMinus1.logBase2(); 9110 AddSubOpc = ISD::ADD; 9111 NegateResult = true; 9112 } else 9113 return SDValue(); 9114 } 9115 9116 SDLoc DL(N); 9117 EVT VT = N->getValueType(0); 9118 SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0, 9119 DAG.getConstant(ShiftAmt, DL, MVT::i64)); 9120 9121 SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0; 9122 SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal; 9123 SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1); 9124 assert(!(NegateResult && TrailingZeroes) && 9125 "NegateResult and TrailingZeroes cannot both be true for now."); 9126 // Negate the result. 9127 if (NegateResult) 9128 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res); 9129 // Shift the result. 9130 if (TrailingZeroes) 9131 return DAG.getNode(ISD::SHL, DL, VT, Res, 9132 DAG.getConstant(TrailingZeroes, DL, MVT::i64)); 9133 return Res; 9134 } 9135 9136 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, 9137 SelectionDAG &DAG) { 9138 // Take advantage of vector comparisons producing 0 or -1 in each lane to 9139 // optimize away operation when it's from a constant. 
9140 // 9141 // The general transformation is: 9142 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> 9143 // AND(VECTOR_CMP(x,y), constant2) 9144 // constant2 = UNARYOP(constant) 9145 9146 // Early exit if this isn't a vector operation, the operand of the 9147 // unary operation isn't a bitwise AND, or if the sizes of the operations 9148 // aren't the same. 9149 EVT VT = N->getValueType(0); 9150 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || 9151 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || 9152 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) 9153 return SDValue(); 9154 9155 // Now check that the other operand of the AND is a constant. We could 9156 // make the transformation for non-constant splats as well, but it's unclear 9157 // that would be a benefit as it would not eliminate any operations, just 9158 // perform one more step in scalar code before moving to the vector unit. 9159 if (BuildVectorSDNode *BV = 9160 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { 9161 // Bail out if the vector isn't a constant. 9162 if (!BV->isConstant()) 9163 return SDValue(); 9164 9165 // Everything checks out. Build up the new and improved node. 9166 SDLoc DL(N); 9167 EVT IntVT = BV->getValueType(0); 9168 // Create a new constant of the appropriate type for the transformed 9169 // DAG. 9170 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); 9171 // The AND node needs bitcasts to/from an integer vector type around it. 9172 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); 9173 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, 9174 N->getOperand(0)->getOperand(0), MaskConst); 9175 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); 9176 return Res; 9177 } 9178 9179 return SDValue(); 9180 } 9181 9182 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, 9183 const AArch64Subtarget *Subtarget) { 9184 // First try to optimize away the conversion when it's conditionally from 9185 // a constant. Vectors only. 9186 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) 9187 return Res; 9188 9189 EVT VT = N->getValueType(0); 9190 if (VT != MVT::f32 && VT != MVT::f64) 9191 return SDValue(); 9192 9193 // Only optimize when the source and destination types have the same width. 9194 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits()) 9195 return SDValue(); 9196 9197 // If the result of an integer load is only used by an integer-to-float 9198 // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. 9199 // This eliminates an "integer-to-vector-move" UOP and improves throughput. 9200 SDValue N0 = N->getOperand(0); 9201 if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && 9202 // Do not change the width of a volatile load. 9203 !cast<LoadSDNode>(N0)->isVolatile()) { 9204 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 9205 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), 9206 LN0->getPointerInfo(), LN0->getAlignment(), 9207 LN0->getMemOperand()->getFlags()); 9208 9209 // Make sure successors of the original load stay after it by updating them 9210 // to use the new Chain. 9211 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); 9212 9213 unsigned Opcode = 9214 (N->getOpcode() == ISD::SINT_TO_FP) ? 
AArch64ISD::SITOF : AArch64ISD::UITOF; 9215 return DAG.getNode(Opcode, SDLoc(N), VT, Load); 9216 } 9217 9218 return SDValue(); 9219 } 9220 9221 /// Fold a floating-point multiply by power of two into floating-point to 9222 /// fixed-point conversion. 9223 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, 9224 TargetLowering::DAGCombinerInfo &DCI, 9225 const AArch64Subtarget *Subtarget) { 9226 if (!Subtarget->hasNEON()) 9227 return SDValue(); 9228 9229 if (!N->getValueType(0).isSimple()) 9230 return SDValue(); 9231 9232 SDValue Op = N->getOperand(0); 9233 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || 9234 Op.getOpcode() != ISD::FMUL) 9235 return SDValue(); 9236 9237 SDValue ConstVec = Op->getOperand(1); 9238 if (!isa<BuildVectorSDNode>(ConstVec)) 9239 return SDValue(); 9240 9241 MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); 9242 uint32_t FloatBits = FloatTy.getSizeInBits(); 9243 if (FloatBits != 32 && FloatBits != 64) 9244 return SDValue(); 9245 9246 MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); 9247 uint32_t IntBits = IntTy.getSizeInBits(); 9248 if (IntBits != 16 && IntBits != 32 && IntBits != 64) 9249 return SDValue(); 9250 9251 // Avoid conversions where iN is larger than the float (e.g., float -> i64). 9252 if (IntBits > FloatBits) 9253 return SDValue(); 9254 9255 BitVector UndefElements; 9256 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 9257 int32_t Bits = IntBits == 64 ? 64 : 32; 9258 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); 9259 if (C == -1 || C == 0 || C > Bits) 9260 return SDValue(); 9261 9262 MVT ResTy; 9263 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 9264 switch (NumLanes) { 9265 default: 9266 return SDValue(); 9267 case 2: 9268 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; 9269 break; 9270 case 4: 9271 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; 9272 break; 9273 } 9274 9275 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) 9276 return SDValue(); 9277 9278 assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) && 9279 "Illegal vector type after legalization"); 9280 9281 SDLoc DL(N); 9282 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; 9283 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs 9284 : Intrinsic::aarch64_neon_vcvtfp2fxu; 9285 SDValue FixConv = 9286 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, 9287 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), 9288 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); 9289 // We can handle smaller integers by generating an extra trunc. 9290 if (IntBits < FloatBits) 9291 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); 9292 9293 return FixConv; 9294 } 9295 9296 /// Fold a floating-point divide by power of two into fixed-point to 9297 /// floating-point conversion. 
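/// For example (an illustrative sketch):
///   (fdiv (sint_to_fp (v4i32 X)), <16.0, 16.0, 16.0, 16.0>)
/// can be emitted as a signed fixed-point conversion with 4 fractional bits,
/// i.e. roughly "scvtf v0.4s, v0.4s, #4".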
9298 static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, 9299 TargetLowering::DAGCombinerInfo &DCI, 9300 const AArch64Subtarget *Subtarget) { 9301 if (!Subtarget->hasNEON()) 9302 return SDValue(); 9303 9304 SDValue Op = N->getOperand(0); 9305 unsigned Opc = Op->getOpcode(); 9306 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || 9307 !Op.getOperand(0).getValueType().isSimple() || 9308 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) 9309 return SDValue(); 9310 9311 SDValue ConstVec = N->getOperand(1); 9312 if (!isa<BuildVectorSDNode>(ConstVec)) 9313 return SDValue(); 9314 9315 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); 9316 int32_t IntBits = IntTy.getSizeInBits(); 9317 if (IntBits != 16 && IntBits != 32 && IntBits != 64) 9318 return SDValue(); 9319 9320 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); 9321 int32_t FloatBits = FloatTy.getSizeInBits(); 9322 if (FloatBits != 32 && FloatBits != 64) 9323 return SDValue(); 9324 9325 // Avoid conversions where iN is larger than the float (e.g., i64 -> float). 9326 if (IntBits > FloatBits) 9327 return SDValue(); 9328 9329 BitVector UndefElements; 9330 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 9331 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1); 9332 if (C == -1 || C == 0 || C > FloatBits) 9333 return SDValue(); 9334 9335 MVT ResTy; 9336 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 9337 switch (NumLanes) { 9338 default: 9339 return SDValue(); 9340 case 2: 9341 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; 9342 break; 9343 case 4: 9344 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; 9345 break; 9346 } 9347 9348 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) 9349 return SDValue(); 9350 9351 SDLoc DL(N); 9352 SDValue ConvInput = Op.getOperand(0); 9353 bool IsSigned = Opc == ISD::SINT_TO_FP; 9354 if (IntBits < FloatBits) 9355 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, 9356 ResTy, ConvInput); 9357 9358 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp 9359 : Intrinsic::aarch64_neon_vcvtfxu2fp; 9360 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), 9361 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput, 9362 DAG.getConstant(C, DL, MVT::i32)); 9363 } 9364 9365 /// An EXTR instruction is made up of two shifts, ORed together. This helper 9366 /// searches for and classifies those shifts. 9367 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, 9368 bool &FromHi) { 9369 if (N.getOpcode() == ISD::SHL) 9370 FromHi = false; 9371 else if (N.getOpcode() == ISD::SRL) 9372 FromHi = true; 9373 else 9374 return false; 9375 9376 if (!isa<ConstantSDNode>(N.getOperand(1))) 9377 return false; 9378 9379 ShiftAmount = N->getConstantOperandVal(1); 9380 Src = N->getOperand(0); 9381 return true; 9382 } 9383 9384 /// EXTR instruction extracts a contiguous chunk of bits from two existing 9385 /// registers viewed as a high/low pair. This function looks for the pattern: 9386 /// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it 9387 /// with an EXTR. Can't quite be done in TableGen because the two immediates 9388 /// aren't independent. 
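/// For example (illustrative), on i64:
///   (or (shl X, #16), (srl Y, #48))
/// becomes
///   (AArch64ISD::EXTR X, Y, #48)
/// where bits [63:16] of the result are X[47:0] and bits [15:0] are Y[63:48].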
9389 static SDValue tryCombineToEXTR(SDNode *N, 9390 TargetLowering::DAGCombinerInfo &DCI) { 9391 SelectionDAG &DAG = DCI.DAG; 9392 SDLoc DL(N); 9393 EVT VT = N->getValueType(0); 9394 9395 assert(N->getOpcode() == ISD::OR && "Unexpected root"); 9396 9397 if (VT != MVT::i32 && VT != MVT::i64) 9398 return SDValue(); 9399 9400 SDValue LHS; 9401 uint32_t ShiftLHS = 0; 9402 bool LHSFromHi = false; 9403 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) 9404 return SDValue(); 9405 9406 SDValue RHS; 9407 uint32_t ShiftRHS = 0; 9408 bool RHSFromHi = false; 9409 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) 9410 return SDValue(); 9411 9412 // If they're both trying to come from the high part of the register, they're 9413 // not really an EXTR. 9414 if (LHSFromHi == RHSFromHi) 9415 return SDValue(); 9416 9417 if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) 9418 return SDValue(); 9419 9420 if (LHSFromHi) { 9421 std::swap(LHS, RHS); 9422 std::swap(ShiftLHS, ShiftRHS); 9423 } 9424 9425 return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, 9426 DAG.getConstant(ShiftRHS, DL, MVT::i64)); 9427 } 9428 9429 static SDValue tryCombineToBSL(SDNode *N, 9430 TargetLowering::DAGCombinerInfo &DCI) { 9431 EVT VT = N->getValueType(0); 9432 SelectionDAG &DAG = DCI.DAG; 9433 SDLoc DL(N); 9434 9435 if (!VT.isVector()) 9436 return SDValue(); 9437 9438 SDValue N0 = N->getOperand(0); 9439 if (N0.getOpcode() != ISD::AND) 9440 return SDValue(); 9441 9442 SDValue N1 = N->getOperand(1); 9443 if (N1.getOpcode() != ISD::AND) 9444 return SDValue(); 9445 9446 // We only have to look for constant vectors here since the general, variable 9447 // case can be handled in TableGen. 9448 unsigned Bits = VT.getScalarSizeInBits(); 9449 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); 9450 for (int i = 1; i >= 0; --i) 9451 for (int j = 1; j >= 0; --j) { 9452 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); 9453 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); 9454 if (!BVN0 || !BVN1) 9455 continue; 9456 9457 bool FoundMatch = true; 9458 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { 9459 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); 9460 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); 9461 if (!CN0 || !CN1 || 9462 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { 9463 FoundMatch = false; 9464 break; 9465 } 9466 } 9467 9468 if (FoundMatch) 9469 return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), 9470 N0->getOperand(1 - i), N1->getOperand(1 - j)); 9471 } 9472 9473 return SDValue(); 9474 } 9475 9476 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 9477 const AArch64Subtarget *Subtarget) { 9478 // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) 9479 SelectionDAG &DAG = DCI.DAG; 9480 EVT VT = N->getValueType(0); 9481 9482 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 9483 return SDValue(); 9484 9485 if (SDValue Res = tryCombineToEXTR(N, DCI)) 9486 return Res; 9487 9488 if (SDValue Res = tryCombineToBSL(N, DCI)) 9489 return Res; 9490 9491 return SDValue(); 9492 } 9493 9494 static SDValue performANDCombine(SDNode *N, 9495 TargetLowering::DAGCombinerInfo &DCI) { 9496 SelectionDAG &DAG = DCI.DAG; 9497 SDValue LHS = N->getOperand(0); 9498 EVT VT = N->getValueType(0); 9499 if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT)) 9500 return SDValue(); 9501 9502 BuildVectorSDNode *BVN = 9503 
dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode()); 9504 if (!BVN) 9505 return SDValue(); 9506 9507 // AND does not accept an immediate, so check if we can use a BIC immediate 9508 // instruction instead. We do this here instead of using a (and x, (mvni imm)) 9509 // pattern in isel, because some immediates may be lowered to the preferred 9510 // (and x, (movi imm)) form, even though an mvni representation also exists. 9511 APInt DefBits(VT.getSizeInBits(), 0); 9512 APInt UndefBits(VT.getSizeInBits(), 0); 9513 if (resolveBuildVector(BVN, DefBits, UndefBits)) { 9514 SDValue NewOp; 9515 9516 DefBits = ~DefBits; 9517 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG, 9518 DefBits, &LHS)) || 9519 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG, 9520 DefBits, &LHS))) 9521 return NewOp; 9522 9523 UndefBits = ~UndefBits; 9524 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG, 9525 UndefBits, &LHS)) || 9526 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG, 9527 UndefBits, &LHS))) 9528 return NewOp; 9529 } 9530 9531 return SDValue(); 9532 } 9533 9534 static SDValue performSRLCombine(SDNode *N, 9535 TargetLowering::DAGCombinerInfo &DCI) { 9536 SelectionDAG &DAG = DCI.DAG; 9537 EVT VT = N->getValueType(0); 9538 if (VT != MVT::i32 && VT != MVT::i64) 9539 return SDValue(); 9540 9541 // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the 9542 // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32) 9543 // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero. 9544 SDValue N0 = N->getOperand(0); 9545 if (N0.getOpcode() == ISD::BSWAP) { 9546 SDLoc DL(N); 9547 SDValue N1 = N->getOperand(1); 9548 SDValue N00 = N0.getOperand(0); 9549 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 9550 uint64_t ShiftAmt = C->getZExtValue(); 9551 if (VT == MVT::i32 && ShiftAmt == 16 && 9552 DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16))) 9553 return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); 9554 if (VT == MVT::i64 && ShiftAmt == 32 && 9555 DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32))) 9556 return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); 9557 } 9558 } 9559 return SDValue(); 9560 } 9561 9562 static SDValue performBitcastCombine(SDNode *N, 9563 TargetLowering::DAGCombinerInfo &DCI, 9564 SelectionDAG &DAG) { 9565 // Wait 'til after everything is legalized to try this. That way we have 9566 // legal vector types and such. 9567 if (DCI.isBeforeLegalizeOps()) 9568 return SDValue(); 9569 9570 // Remove extraneous bitcasts around an extract_subvector. 9571 // For example, 9572 // (v4i16 (bitconvert 9573 // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) 9574 // becomes 9575 // (extract_subvector ((v8i16 ...), (i64 4))) 9576 9577 // Only interested in 64-bit vectors as the ultimate result. 9578 EVT VT = N->getValueType(0); 9579 if (!VT.isVector()) 9580 return SDValue(); 9581 if (VT.getSimpleVT().getSizeInBits() != 64) 9582 return SDValue(); 9583 // Is the operand an extract_subvector starting at the beginning or halfway 9584 // point of the vector? A low half may also come through as an 9585 // EXTRACT_SUBREG, so look for that, too. 
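  // A sketch of the low-half form handled below (using the dsub subregister
  // index, as in the code that follows):
  //   (v4i16 (bitconvert (EXTRACT_SUBREG (v2i64 (bitconvert (v8i16 X))), dsub)))
  // simplifies to
  //   (v4i16 (EXTRACT_SUBREG (v8i16 X), dsub))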
9586 SDValue Op0 = N->getOperand(0); 9587 if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && 9588 !(Op0->isMachineOpcode() && 9589 Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) 9590 return SDValue(); 9591 uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue(); 9592 if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { 9593 if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) 9594 return SDValue(); 9595 } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { 9596 if (idx != AArch64::dsub) 9597 return SDValue(); 9598 // The dsub reference is equivalent to a lane zero subvector reference. 9599 idx = 0; 9600 } 9601 // Look through the bitcast of the input to the extract. 9602 if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) 9603 return SDValue(); 9604 SDValue Source = Op0->getOperand(0)->getOperand(0); 9605 // If the source type has twice the number of elements as our destination 9606 // type, we know this is an extract of the high or low half of the vector. 9607 EVT SVT = Source->getValueType(0); 9608 if (!SVT.isVector() || 9609 SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) 9610 return SDValue(); 9611 9612 LLVM_DEBUG( 9613 dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); 9614 9615 // Create the simplified form to just extract the low or high half of the 9616 // vector directly rather than bothering with the bitcasts. 9617 SDLoc dl(N); 9618 unsigned NumElements = VT.getVectorNumElements(); 9619 if (idx) { 9620 SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64); 9621 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); 9622 } else { 9623 SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32); 9624 return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, 9625 Source, SubReg), 9626 0); 9627 } 9628 } 9629 9630 static SDValue performConcatVectorsCombine(SDNode *N, 9631 TargetLowering::DAGCombinerInfo &DCI, 9632 SelectionDAG &DAG) { 9633 SDLoc dl(N); 9634 EVT VT = N->getValueType(0); 9635 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); 9636 9637 // Optimize concat_vectors of truncated vectors, where the intermediate 9638 // type is illegal, to avoid said illegality, e.g., 9639 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))), 9640 // (v2i16 (truncate (v2i64))))) 9641 // -> 9642 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))), 9643 // (v4i32 (bitcast (v2i64))), 9644 // <0, 2, 4, 6>))) 9645 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed 9646 // on both input and result type, so we might generate worse code. 9647 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. 9648 if (N->getNumOperands() == 2 && 9649 N0->getOpcode() == ISD::TRUNCATE && 9650 N1->getOpcode() == ISD::TRUNCATE) { 9651 SDValue N00 = N0->getOperand(0); 9652 SDValue N10 = N1->getOperand(0); 9653 EVT N00VT = N00.getValueType(); 9654 9655 if (N00VT == N10.getValueType() && 9656 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) && 9657 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) { 9658 MVT MidVT = (N00VT == MVT::v2i64 ? 
MVT::v4i32 : MVT::v8i16); 9659 SmallVector<int, 8> Mask(MidVT.getVectorNumElements()); 9660 for (size_t i = 0; i < Mask.size(); ++i) 9661 Mask[i] = i * 2; 9662 return DAG.getNode(ISD::TRUNCATE, dl, VT, 9663 DAG.getVectorShuffle( 9664 MidVT, dl, 9665 DAG.getNode(ISD::BITCAST, dl, MidVT, N00), 9666 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask)); 9667 } 9668 } 9669 9670 // Wait 'til after everything is legalized to try this. That way we have 9671 // legal vector types and such. 9672 if (DCI.isBeforeLegalizeOps()) 9673 return SDValue(); 9674 9675 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector 9676 // splat. The indexed instructions are going to be expecting a DUPLANE64, so 9677 // canonicalise to that. 9678 if (N0 == N1 && VT.getVectorNumElements() == 2) { 9679 assert(VT.getScalarSizeInBits() == 64); 9680 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), 9681 DAG.getConstant(0, dl, MVT::i64)); 9682 } 9683 9684 // Canonicalise concat_vectors so that the right-hand vector has as few 9685 // bit-casts as possible before its real operation. The primary matching 9686 // destination for these operations will be the narrowing "2" instructions, 9687 // which depend on the operation being performed on this right-hand vector. 9688 // For example, 9689 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) 9690 // becomes 9691 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) 9692 9693 if (N1->getOpcode() != ISD::BITCAST) 9694 return SDValue(); 9695 SDValue RHS = N1->getOperand(0); 9696 MVT RHSTy = RHS.getValueType().getSimpleVT(); 9697 // If the RHS is not a vector, this is not the pattern we're looking for. 9698 if (!RHSTy.isVector()) 9699 return SDValue(); 9700 9701 LLVM_DEBUG( 9702 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); 9703 9704 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), 9705 RHSTy.getVectorNumElements() * 2); 9706 return DAG.getNode(ISD::BITCAST, dl, VT, 9707 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, 9708 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0), 9709 RHS)); 9710 } 9711 9712 static SDValue tryCombineFixedPointConvert(SDNode *N, 9713 TargetLowering::DAGCombinerInfo &DCI, 9714 SelectionDAG &DAG) { 9715 // Wait until after everything is legalized to try this. That way we have 9716 // legal vector types and such. 9717 if (DCI.isBeforeLegalizeOps()) 9718 return SDValue(); 9719 // Transform a scalar conversion of a value from a lane extract into a 9720 // lane extract of a vector conversion. E.g., from foo1 to foo2: 9721 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } 9722 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } 9723 // 9724 // The second form interacts better with instruction selection and the 9725 // register allocator to avoid cross-class register copies that aren't 9726 // coalescable due to a lane reference. 9727 9728 // Check the operand and see if it originates from a lane extract. 9729 SDValue Op1 = N->getOperand(1); 9730 if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 9731 // Yep, no additional predication needed. Perform the transform. 9732 SDValue IID = N->getOperand(0); 9733 SDValue Shift = N->getOperand(2); 9734 SDValue Vec = Op1.getOperand(0); 9735 SDValue Lane = Op1.getOperand(1); 9736 EVT ResTy = N->getValueType(0); 9737 EVT VecResTy; 9738 SDLoc DL(N); 9739 9740 // The vector width should be 128 bits by the time we get here, even 9741 // if it started as 64 bits (the extract_vector handling will have 9742 // done so). 
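    // A sketch of the rewrite for the foo1/foo2 example above (v2i64 vector,
    // lane 1, 9 fractional bits):
    //   (f64 (aarch64_neon_vcvtfxs2fp (i64 (extract_elt V, 1)), 9))
    // becomes
    //   (f64 (extract_elt (v2f64 (aarch64_neon_vcvtfxs2fp V, 9)), 1))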
9743 assert(Vec.getValueSizeInBits() == 128 && 9744 "unexpected vector size on extract_vector_elt!"); 9745 if (Vec.getValueType() == MVT::v4i32) 9746 VecResTy = MVT::v4f32; 9747 else if (Vec.getValueType() == MVT::v2i64) 9748 VecResTy = MVT::v2f64; 9749 else 9750 llvm_unreachable("unexpected vector type!"); 9751 9752 SDValue Convert = 9753 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); 9754 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); 9755 } 9756 return SDValue(); 9757 } 9758 9759 // AArch64 high-vector "long" operations are formed by performing the non-high 9760 // version on an extract_subvector of each operand which gets the high half: 9761 // 9762 // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) 9763 // 9764 // However, there are cases which don't have an extract_high explicitly, but 9765 // have another operation that can be made compatible with one for free. For 9766 // example: 9767 // 9768 // (dupv64 scalar) --> (extract_high (dup128 scalar)) 9769 // 9770 // This routine does the actual conversion of such DUPs, once outer routines 9771 // have determined that everything else is in order. 9772 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold 9773 // similarly here. 9774 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { 9775 switch (N.getOpcode()) { 9776 case AArch64ISD::DUP: 9777 case AArch64ISD::DUPLANE8: 9778 case AArch64ISD::DUPLANE16: 9779 case AArch64ISD::DUPLANE32: 9780 case AArch64ISD::DUPLANE64: 9781 case AArch64ISD::MOVI: 9782 case AArch64ISD::MOVIshift: 9783 case AArch64ISD::MOVIedit: 9784 case AArch64ISD::MOVImsl: 9785 case AArch64ISD::MVNIshift: 9786 case AArch64ISD::MVNImsl: 9787 break; 9788 default: 9789 // FMOV could be supported, but isn't very useful, as it would only occur 9790 // if you passed a bitcast' floating point immediate to an eligible long 9791 // integer op (addl, smull, ...). 9792 return SDValue(); 9793 } 9794 9795 MVT NarrowTy = N.getSimpleValueType(); 9796 if (!NarrowTy.is64BitVector()) 9797 return SDValue(); 9798 9799 MVT ElementTy = NarrowTy.getVectorElementType(); 9800 unsigned NumElems = NarrowTy.getVectorNumElements(); 9801 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2); 9802 9803 SDLoc dl(N); 9804 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, 9805 DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()), 9806 DAG.getConstant(NumElems, dl, MVT::i64)); 9807 } 9808 9809 static bool isEssentiallyExtractHighSubvector(SDValue N) { 9810 if (N.getOpcode() == ISD::BITCAST) 9811 N = N.getOperand(0); 9812 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR) 9813 return false; 9814 return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() == 9815 N.getOperand(0).getValueType().getVectorNumElements() / 2; 9816 } 9817 9818 /// Helper structure to keep track of ISD::SET_CC operands. 9819 struct GenericSetCCInfo { 9820 const SDValue *Opnd0; 9821 const SDValue *Opnd1; 9822 ISD::CondCode CC; 9823 }; 9824 9825 /// Helper structure to keep track of a SET_CC lowered into AArch64 code. 9826 struct AArch64SetCCInfo { 9827 const SDValue *Cmp; 9828 AArch64CC::CondCode CC; 9829 }; 9830 9831 /// Helper structure to keep track of SetCC information. 9832 union SetCCInfo { 9833 GenericSetCCInfo Generic; 9834 AArch64SetCCInfo AArch64; 9835 }; 9836 9837 /// Helper structure to be able to read SetCC information. If set to 9838 /// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a 9839 /// GenericSetCCInfo. 
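/// A typical use looks roughly like (illustrative only):
/// \code
///   SetCCInfoAndKind Info;
///   if (isSetCC(Op, Info)) {
///     if (Info.IsAArch64)
///       ; // consume Info.Info.AArch64.Cmp and Info.Info.AArch64.CC
///     else
///       ; // consume Info.Info.Generic.Opnd0/Opnd1 and Info.Info.Generic.CC
///   }
/// \endcode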
9840 struct SetCCInfoAndKind { 9841 SetCCInfo Info; 9842 bool IsAArch64; 9843 }; 9844 9845 /// Check whether or not \p Op is a SET_CC operation, either a generic or 9846 /// an 9847 /// AArch64 lowered one. 9848 /// \p SetCCInfo is filled accordingly. 9849 /// \post SetCCInfo is meanginfull only when this function returns true. 9850 /// \return True when Op is a kind of SET_CC operation. 9851 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { 9852 // If this is a setcc, this is straight forward. 9853 if (Op.getOpcode() == ISD::SETCC) { 9854 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0); 9855 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1); 9856 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 9857 SetCCInfo.IsAArch64 = false; 9858 return true; 9859 } 9860 // Otherwise, check if this is a matching csel instruction. 9861 // In other words: 9862 // - csel 1, 0, cc 9863 // - csel 0, 1, !cc 9864 if (Op.getOpcode() != AArch64ISD::CSEL) 9865 return false; 9866 // Set the information about the operands. 9867 // TODO: we want the operands of the Cmp not the csel 9868 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3); 9869 SetCCInfo.IsAArch64 = true; 9870 SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>( 9871 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 9872 9873 // Check that the operands matches the constraints: 9874 // (1) Both operands must be constants. 9875 // (2) One must be 1 and the other must be 0. 9876 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0)); 9877 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 9878 9879 // Check (1). 9880 if (!TValue || !FValue) 9881 return false; 9882 9883 // Check (2). 9884 if (!TValue->isOne()) { 9885 // Update the comparison when we are interested in !cc. 9886 std::swap(TValue, FValue); 9887 SetCCInfo.Info.AArch64.CC = 9888 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC); 9889 } 9890 return TValue->isOne() && FValue->isNullValue(); 9891 } 9892 9893 // Returns true if Op is setcc or zext of setcc. 9894 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { 9895 if (isSetCC(Op, Info)) 9896 return true; 9897 return ((Op.getOpcode() == ISD::ZERO_EXTEND) && 9898 isSetCC(Op->getOperand(0), Info)); 9899 } 9900 9901 // The folding we want to perform is: 9902 // (add x, [zext] (setcc cc ...) ) 9903 // --> 9904 // (csel x, (add x, 1), !cc ...) 9905 // 9906 // The latter will get matched to a CSINC instruction. 9907 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { 9908 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!"); 9909 SDValue LHS = Op->getOperand(0); 9910 SDValue RHS = Op->getOperand(1); 9911 SetCCInfoAndKind InfoAndKind; 9912 9913 // If neither operand is a SET_CC, give up. 9914 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { 9915 std::swap(LHS, RHS); 9916 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) 9917 return SDValue(); 9918 } 9919 9920 // FIXME: This could be generatized to work for FP comparisons. 9921 EVT CmpVT = InfoAndKind.IsAArch64 9922 ? 
InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType() 9923 : InfoAndKind.Info.Generic.Opnd0->getValueType(); 9924 if (CmpVT != MVT::i32 && CmpVT != MVT::i64) 9925 return SDValue(); 9926 9927 SDValue CCVal; 9928 SDValue Cmp; 9929 SDLoc dl(Op); 9930 if (InfoAndKind.IsAArch64) { 9931 CCVal = DAG.getConstant( 9932 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl, 9933 MVT::i32); 9934 Cmp = *InfoAndKind.Info.AArch64.Cmp; 9935 } else 9936 Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0, 9937 *InfoAndKind.Info.Generic.Opnd1, 9938 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true), 9939 CCVal, DAG, dl); 9940 9941 EVT VT = Op->getValueType(0); 9942 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT)); 9943 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); 9944 } 9945 9946 // The basic add/sub long vector instructions have variants with "2" on the end 9947 // which act on the high-half of their inputs. They are normally matched by 9948 // patterns like: 9949 // 9950 // (add (zeroext (extract_high LHS)), 9951 // (zeroext (extract_high RHS))) 9952 // -> uaddl2 vD, vN, vM 9953 // 9954 // However, if one of the extracts is something like a duplicate, this 9955 // instruction can still be used profitably. This function puts the DAG into a 9956 // more appropriate form for those patterns to trigger. 9957 static SDValue performAddSubLongCombine(SDNode *N, 9958 TargetLowering::DAGCombinerInfo &DCI, 9959 SelectionDAG &DAG) { 9960 if (DCI.isBeforeLegalizeOps()) 9961 return SDValue(); 9962 9963 MVT VT = N->getSimpleValueType(0); 9964 if (!VT.is128BitVector()) { 9965 if (N->getOpcode() == ISD::ADD) 9966 return performSetccAddFolding(N, DAG); 9967 return SDValue(); 9968 } 9969 9970 // Make sure both branches are extended in the same way. 9971 SDValue LHS = N->getOperand(0); 9972 SDValue RHS = N->getOperand(1); 9973 if ((LHS.getOpcode() != ISD::ZERO_EXTEND && 9974 LHS.getOpcode() != ISD::SIGN_EXTEND) || 9975 LHS.getOpcode() != RHS.getOpcode()) 9976 return SDValue(); 9977 9978 unsigned ExtType = LHS.getOpcode(); 9979 9980 // It's not worth doing if at least one of the inputs isn't already an 9981 // extract, but we don't know which it'll be so we have to try both. 9982 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) { 9983 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); 9984 if (!RHS.getNode()) 9985 return SDValue(); 9986 9987 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); 9988 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) { 9989 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); 9990 if (!LHS.getNode()) 9991 return SDValue(); 9992 9993 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); 9994 } 9995 9996 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); 9997 } 9998 9999 // Massage DAGs which we can use the high-half "long" operations on into 10000 // something isel will recognize better. E.g. 
10001 // 10002 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> 10003 // (aarch64_neon_umull (extract_high (v2i64 vec))) 10004 // (extract_high (v2i64 (dup128 scalar))))) 10005 // 10006 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, 10007 TargetLowering::DAGCombinerInfo &DCI, 10008 SelectionDAG &DAG) { 10009 if (DCI.isBeforeLegalizeOps()) 10010 return SDValue(); 10011 10012 SDValue LHS = N->getOperand(1); 10013 SDValue RHS = N->getOperand(2); 10014 assert(LHS.getValueType().is64BitVector() && 10015 RHS.getValueType().is64BitVector() && 10016 "unexpected shape for long operation"); 10017 10018 // Either node could be a DUP, but it's not worth doing both of them (you'd 10019 // just as well use the non-high version) so look for a corresponding extract 10020 // operation on the other "wing". 10021 if (isEssentiallyExtractHighSubvector(LHS)) { 10022 RHS = tryExtendDUPToExtractHigh(RHS, DAG); 10023 if (!RHS.getNode()) 10024 return SDValue(); 10025 } else if (isEssentiallyExtractHighSubvector(RHS)) { 10026 LHS = tryExtendDUPToExtractHigh(LHS, DAG); 10027 if (!LHS.getNode()) 10028 return SDValue(); 10029 } 10030 10031 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), 10032 N->getOperand(0), LHS, RHS); 10033 } 10034 10035 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { 10036 MVT ElemTy = N->getSimpleValueType(0).getScalarType(); 10037 unsigned ElemBits = ElemTy.getSizeInBits(); 10038 10039 int64_t ShiftAmount; 10040 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) { 10041 APInt SplatValue, SplatUndef; 10042 unsigned SplatBitSize; 10043 bool HasAnyUndefs; 10044 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, 10045 HasAnyUndefs, ElemBits) || 10046 SplatBitSize != ElemBits) 10047 return SDValue(); 10048 10049 ShiftAmount = SplatValue.getSExtValue(); 10050 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) { 10051 ShiftAmount = CVN->getSExtValue(); 10052 } else 10053 return SDValue(); 10054 10055 unsigned Opcode; 10056 bool IsRightShift; 10057 switch (IID) { 10058 default: 10059 llvm_unreachable("Unknown shift intrinsic"); 10060 case Intrinsic::aarch64_neon_sqshl: 10061 Opcode = AArch64ISD::SQSHL_I; 10062 IsRightShift = false; 10063 break; 10064 case Intrinsic::aarch64_neon_uqshl: 10065 Opcode = AArch64ISD::UQSHL_I; 10066 IsRightShift = false; 10067 break; 10068 case Intrinsic::aarch64_neon_srshl: 10069 Opcode = AArch64ISD::SRSHR_I; 10070 IsRightShift = true; 10071 break; 10072 case Intrinsic::aarch64_neon_urshl: 10073 Opcode = AArch64ISD::URSHR_I; 10074 IsRightShift = true; 10075 break; 10076 case Intrinsic::aarch64_neon_sqshlu: 10077 Opcode = AArch64ISD::SQSHLU_I; 10078 IsRightShift = false; 10079 break; 10080 } 10081 10082 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { 10083 SDLoc dl(N); 10084 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), 10085 DAG.getConstant(-ShiftAmount, dl, MVT::i32)); 10086 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) { 10087 SDLoc dl(N); 10088 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), 10089 DAG.getConstant(ShiftAmount, dl, MVT::i32)); 10090 } 10091 10092 return SDValue(); 10093 } 10094 10095 // The CRC32[BH] instructions ignore the high bits of their data operand. Since 10096 // the intrinsics must be legal and take an i32, this means there's almost 10097 // certainly going to be a zext in the DAG which we can eliminate. 
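// For example (illustrative), with the byte variants (Mask == 0xff):
//   (aarch64_crc32b Crc, (and Data, 0xff))
// can simply become
//   (aarch64_crc32b Crc, Data)
// since the instruction only reads the low 8 bits of Data anyway.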
10098 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { 10099 SDValue AndN = N->getOperand(2); 10100 if (AndN.getOpcode() != ISD::AND) 10101 return SDValue(); 10102 10103 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1)); 10104 if (!CMask || CMask->getZExtValue() != Mask) 10105 return SDValue(); 10106 10107 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, 10108 N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); 10109 } 10110 10111 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, 10112 SelectionDAG &DAG) { 10113 SDLoc dl(N); 10114 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), 10115 DAG.getNode(Opc, dl, 10116 N->getOperand(1).getSimpleValueType(), 10117 N->getOperand(1)), 10118 DAG.getConstant(0, dl, MVT::i64)); 10119 } 10120 10121 static SDValue performIntrinsicCombine(SDNode *N, 10122 TargetLowering::DAGCombinerInfo &DCI, 10123 const AArch64Subtarget *Subtarget) { 10124 SelectionDAG &DAG = DCI.DAG; 10125 unsigned IID = getIntrinsicID(N); 10126 switch (IID) { 10127 default: 10128 break; 10129 case Intrinsic::aarch64_neon_vcvtfxs2fp: 10130 case Intrinsic::aarch64_neon_vcvtfxu2fp: 10131 return tryCombineFixedPointConvert(N, DCI, DAG); 10132 case Intrinsic::aarch64_neon_saddv: 10133 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); 10134 case Intrinsic::aarch64_neon_uaddv: 10135 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG); 10136 case Intrinsic::aarch64_neon_sminv: 10137 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG); 10138 case Intrinsic::aarch64_neon_uminv: 10139 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG); 10140 case Intrinsic::aarch64_neon_smaxv: 10141 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG); 10142 case Intrinsic::aarch64_neon_umaxv: 10143 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); 10144 case Intrinsic::aarch64_neon_fmax: 10145 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0), 10146 N->getOperand(1), N->getOperand(2)); 10147 case Intrinsic::aarch64_neon_fmin: 10148 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0), 10149 N->getOperand(1), N->getOperand(2)); 10150 case Intrinsic::aarch64_neon_fmaxnm: 10151 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), 10152 N->getOperand(1), N->getOperand(2)); 10153 case Intrinsic::aarch64_neon_fminnm: 10154 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), 10155 N->getOperand(1), N->getOperand(2)); 10156 case Intrinsic::aarch64_neon_smull: 10157 case Intrinsic::aarch64_neon_umull: 10158 case Intrinsic::aarch64_neon_pmull: 10159 case Intrinsic::aarch64_neon_sqdmull: 10160 return tryCombineLongOpWithDup(IID, N, DCI, DAG); 10161 case Intrinsic::aarch64_neon_sqshl: 10162 case Intrinsic::aarch64_neon_uqshl: 10163 case Intrinsic::aarch64_neon_sqshlu: 10164 case Intrinsic::aarch64_neon_srshl: 10165 case Intrinsic::aarch64_neon_urshl: 10166 return tryCombineShiftImm(IID, N, DAG); 10167 case Intrinsic::aarch64_crc32b: 10168 case Intrinsic::aarch64_crc32cb: 10169 return tryCombineCRC32(0xff, N, DAG); 10170 case Intrinsic::aarch64_crc32h: 10171 case Intrinsic::aarch64_crc32ch: 10172 return tryCombineCRC32(0xffff, N, DAG); 10173 } 10174 return SDValue(); 10175 } 10176 10177 static SDValue performExtendCombine(SDNode *N, 10178 TargetLowering::DAGCombinerInfo &DCI, 10179 SelectionDAG &DAG) { 10180 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then 10181 // we can convert that 
DUP into another extract_high (of a bigger DUP), which 10182 // helps the backend to decide that an sabdl2 would be useful, saving a real 10183 // extract_high operation. 10184 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && 10185 N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { 10186 SDNode *ABDNode = N->getOperand(0).getNode(); 10187 unsigned IID = getIntrinsicID(ABDNode); 10188 if (IID == Intrinsic::aarch64_neon_sabd || 10189 IID == Intrinsic::aarch64_neon_uabd) { 10190 SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); 10191 if (!NewABD.getNode()) 10192 return SDValue(); 10193 10194 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), 10195 NewABD); 10196 } 10197 } 10198 10199 // This is effectively a custom type legalization for AArch64. 10200 // 10201 // Type legalization will split an extend of a small, legal, type to a larger 10202 // illegal type by first splitting the destination type, often creating 10203 // illegal source types, which then get legalized in isel-confusing ways, 10204 // leading to really terrible codegen. E.g., 10205 // %result = v8i32 sext v8i8 %value 10206 // becomes 10207 // %losrc = extract_subreg %value, ... 10208 // %hisrc = extract_subreg %value, ... 10209 // %lo = v4i32 sext v4i8 %losrc 10210 // %hi = v4i32 sext v4i8 %hisrc 10211 // Things go rapidly downhill from there. 10212 // 10213 // For AArch64, the [sz]ext vector instructions can only go up one element 10214 // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 10215 // take two instructions. 10216 // 10217 // This implies that the most efficient way to do the extend from v8i8 10218 // to two v4i32 values is to first extend the v8i8 to v8i16, then do 10219 // the normal splitting to happen for the v8i16->v8i32. 10220 10221 // This is pre-legalization to catch some cases where the default 10222 // type legalization will create ill-tempered code. 10223 if (!DCI.isBeforeLegalizeOps()) 10224 return SDValue(); 10225 10226 // We're only interested in cleaning things up for non-legal vector types 10227 // here. If both the source and destination are legal, things will just 10228 // work naturally without any fiddling. 10229 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10230 EVT ResVT = N->getValueType(0); 10231 if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) 10232 return SDValue(); 10233 // If the vector type isn't a simple VT, it's beyond the scope of what 10234 // we're worried about here. Let legalization do its thing and hope for 10235 // the best. 10236 SDValue Src = N->getOperand(0); 10237 EVT SrcVT = Src->getValueType(0); 10238 if (!ResVT.isSimple() || !SrcVT.isSimple()) 10239 return SDValue(); 10240 10241 // If the source VT is a 64-bit vector, we can play games and get the 10242 // better results we want. 10243 if (SrcVT.getSizeInBits() != 64) 10244 return SDValue(); 10245 10246 unsigned SrcEltSize = SrcVT.getScalarSizeInBits(); 10247 unsigned ElementCount = SrcVT.getVectorNumElements(); 10248 SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); 10249 SDLoc DL(N); 10250 Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); 10251 10252 // Now split the rest of the operation into two halves, each with a 64 10253 // bit source. 
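  // A sketch of the remaining steps for the v8i8 -> v8i32 example above,
  // where Src is now the single-step extend (v8i16 ([sz]ext %value)):
  //   Lo = ([sz]ext (v4i16 (extract_subvector Src, 0)))   : v4i32
  //   Hi = ([sz]ext (v4i16 (extract_subvector Src, 4)))   : v4i32
  //   result = (concat_vectors Lo, Hi)                    : v8i32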
10254 EVT LoVT, HiVT; 10255 SDValue Lo, Hi; 10256 unsigned NumElements = ResVT.getVectorNumElements(); 10257 assert(!(NumElements & 1) && "Splitting vector, but not in half!"); 10258 LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), 10259 ResVT.getVectorElementType(), NumElements / 2); 10260 10261 EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), 10262 LoVT.getVectorNumElements()); 10263 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, 10264 DAG.getConstant(0, DL, MVT::i64)); 10265 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, 10266 DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64)); 10267 Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); 10268 Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); 10269 10270 // Now combine the parts back together so we still have a single result 10271 // like the combiner expects. 10272 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); 10273 } 10274 10275 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, 10276 SDValue SplatVal, unsigned NumVecElts) { 10277 assert(!St.isTruncatingStore() && "cannot split truncating vector store"); 10278 unsigned OrigAlignment = St.getAlignment(); 10279 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8; 10280 10281 // Create scalar stores. This is at least as good as the code sequence for a 10282 // split unaligned store which is a dup.s, ext.b, and two stores. 10283 // Most of the time the three stores should be replaced by store pair 10284 // instructions (stp). 10285 SDLoc DL(&St); 10286 SDValue BasePtr = St.getBasePtr(); 10287 uint64_t BaseOffset = 0; 10288 10289 const MachinePointerInfo &PtrInfo = St.getPointerInfo(); 10290 SDValue NewST1 = 10291 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo, 10292 OrigAlignment, St.getMemOperand()->getFlags()); 10293 10294 // As this in ISel, we will not merge this add which may degrade results. 10295 if (BasePtr->getOpcode() == ISD::ADD && 10296 isa<ConstantSDNode>(BasePtr->getOperand(1))) { 10297 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue(); 10298 BasePtr = BasePtr->getOperand(0); 10299 } 10300 10301 unsigned Offset = EltOffset; 10302 while (--NumVecElts) { 10303 unsigned Alignment = MinAlign(OrigAlignment, Offset); 10304 SDValue OffsetPtr = 10305 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 10306 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64)); 10307 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, 10308 PtrInfo.getWithOffset(Offset), Alignment, 10309 St.getMemOperand()->getFlags()); 10310 Offset += EltOffset; 10311 } 10312 return NewST1; 10313 } 10314 10315 /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The 10316 /// load store optimizer pass will merge them to store pair stores. This should 10317 /// be better than a movi to create the vector zero followed by a vector store 10318 /// if the zero constant is not re-used, since one instructions and one register 10319 /// live range will be removed. 10320 /// 10321 /// For example, the final generated code should be: 10322 /// 10323 /// stp xzr, xzr, [x0] 10324 /// 10325 /// instead of: 10326 /// 10327 /// movi v0.2d, #0 10328 /// str q0, [x0] 10329 /// 10330 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) { 10331 SDValue StVal = St.getValue(); 10332 EVT VT = StVal.getValueType(); 10333 10334 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or 10335 // 2, 3 or 4 i32 elements. 
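  // For example (roughly, once the load/store optimizer has merged the
  // scalar stores), a zero store of a v3i64 value becomes:
  //   stp xzr, xzr, [x0]
  //   str xzr, [x0, #16]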
10336 int NumVecElts = VT.getVectorNumElements(); 10337 if (!(((NumVecElts == 2 || NumVecElts == 3) && 10338 VT.getVectorElementType().getSizeInBits() == 64) || 10339 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) && 10340 VT.getVectorElementType().getSizeInBits() == 32))) 10341 return SDValue(); 10342 10343 if (StVal.getOpcode() != ISD::BUILD_VECTOR) 10344 return SDValue(); 10345 10346 // If the zero constant has more than one use then the vector store could be 10347 // better since the constant mov will be amortized and stp q instructions 10348 // should be able to be formed. 10349 if (!StVal.hasOneUse()) 10350 return SDValue(); 10351 10352 // If the store is truncating then it's going down to i16 or smaller, which 10353 // means it can be implemented in a single store anyway. 10354 if (St.isTruncatingStore()) 10355 return SDValue(); 10356 10357 // If the immediate offset of the address operand is too large for the stp 10358 // instruction, then bail out. 10359 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) { 10360 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1); 10361 if (Offset < -512 || Offset > 504) 10362 return SDValue(); 10363 } 10364 10365 for (int I = 0; I < NumVecElts; ++I) { 10366 SDValue EltVal = StVal.getOperand(I); 10367 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal)) 10368 return SDValue(); 10369 } 10370 10371 // Use a CopyFromReg WZR/XZR here to prevent 10372 // DAGCombiner::MergeConsecutiveStores from undoing this transformation. 10373 SDLoc DL(&St); 10374 unsigned ZeroReg; 10375 EVT ZeroVT; 10376 if (VT.getVectorElementType().getSizeInBits() == 32) { 10377 ZeroReg = AArch64::WZR; 10378 ZeroVT = MVT::i32; 10379 } else { 10380 ZeroReg = AArch64::XZR; 10381 ZeroVT = MVT::i64; 10382 } 10383 SDValue SplatVal = 10384 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT); 10385 return splitStoreSplat(DAG, St, SplatVal, NumVecElts); 10386 } 10387 10388 /// Replace a splat of a scalar to a vector store by scalar stores of the scalar 10389 /// value. The load store optimizer pass will merge them to store pair stores. 10390 /// This has better performance than a splat of the scalar followed by a split 10391 /// vector store. Even if the stores are not merged it is four stores vs a dup, 10392 /// followed by an ext.b and two stores. 10393 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) { 10394 SDValue StVal = St.getValue(); 10395 EVT VT = StVal.getValueType(); 10396 10397 // Don't replace floating point stores, they possibly won't be transformed to 10398 // stp because of the store pair suppress pass. 10399 if (VT.isFloatingPoint()) 10400 return SDValue(); 10401 10402 // We can express a splat as store pair(s) for 2 or 4 elements. 10403 unsigned NumVecElts = VT.getVectorNumElements(); 10404 if (NumVecElts != 4 && NumVecElts != 2) 10405 return SDValue(); 10406 10407 // If the store is truncating then it's going down to i16 or smaller, which 10408 // means it can be implemented in a single store anyway. 10409 if (St.isTruncatingStore()) 10410 return SDValue(); 10411 10412 // Check that this is a splat. 10413 // Make sure that each of the relevant vector element locations are inserted 10414 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32. 10415 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1); 10416 SDValue SplatVal; 10417 for (unsigned I = 0; I < NumVecElts; ++I) { 10418 // Check for insert vector elements. 
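    // What this loop expects to peel apart is a chain such as (for v2i64):
    //   (insert_vector_elt (insert_vector_elt undef, X, 0), X, 1)
    // walking from the outermost insert inwards through operand 0.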
10419 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) 10420 return SDValue(); 10421 10422 // Check that same value is inserted at each vector element. 10423 if (I == 0) 10424 SplatVal = StVal.getOperand(1); 10425 else if (StVal.getOperand(1) != SplatVal) 10426 return SDValue(); 10427 10428 // Check insert element index. 10429 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2)); 10430 if (!CIndex) 10431 return SDValue(); 10432 uint64_t IndexVal = CIndex->getZExtValue(); 10433 if (IndexVal >= NumVecElts) 10434 return SDValue(); 10435 IndexNotInserted.reset(IndexVal); 10436 10437 StVal = StVal.getOperand(0); 10438 } 10439 // Check that all vector element locations were inserted to. 10440 if (IndexNotInserted.any()) 10441 return SDValue(); 10442 10443 return splitStoreSplat(DAG, St, SplatVal, NumVecElts); 10444 } 10445 10446 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 10447 SelectionDAG &DAG, 10448 const AArch64Subtarget *Subtarget) { 10449 10450 StoreSDNode *S = cast<StoreSDNode>(N); 10451 if (S->isVolatile() || S->isIndexed()) 10452 return SDValue(); 10453 10454 SDValue StVal = S->getValue(); 10455 EVT VT = StVal.getValueType(); 10456 if (!VT.isVector()) 10457 return SDValue(); 10458 10459 // If we get a splat of zeros, convert this vector store to a store of 10460 // scalars. They will be merged into store pairs of xzr thereby removing one 10461 // instruction and one register. 10462 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S)) 10463 return ReplacedZeroSplat; 10464 10465 // FIXME: The logic for deciding if an unaligned store should be split should 10466 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be 10467 // a call to that function here. 10468 10469 if (!Subtarget->isMisaligned128StoreSlow()) 10470 return SDValue(); 10471 10472 // Don't split at -Oz. 10473 if (DAG.getMachineFunction().getFunction().hasMinSize()) 10474 return SDValue(); 10475 10476 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting 10477 // those up regresses performance on micro-benchmarks and olden/bh. 10478 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64) 10479 return SDValue(); 10480 10481 // Split unaligned 16B stores. They are terrible for performance. 10482 // Don't split stores with alignment of 1 or 2. Code that uses clang vector 10483 // extensions can use this to mark that it does not want splitting to happen 10484 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of 10485 // eliminating alignment hazards is only 1 in 8 for alignment of 2. 10486 if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || 10487 S->getAlignment() <= 2) 10488 return SDValue(); 10489 10490 // If we get a splat of a scalar convert this vector store to a store of 10491 // scalars. They will be merged into store pairs thereby removing two 10492 // instructions. 10493 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S)) 10494 return ReplacedSplat; 10495 10496 SDLoc DL(S); 10497 unsigned NumElts = VT.getVectorNumElements() / 2; 10498 // Split VT into two. 
10499 EVT HalfVT = 10500 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); 10501 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, 10502 DAG.getConstant(0, DL, MVT::i64)); 10503 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, 10504 DAG.getConstant(NumElts, DL, MVT::i64)); 10505 SDValue BasePtr = S->getBasePtr(); 10506 SDValue NewST1 = 10507 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), 10508 S->getAlignment(), S->getMemOperand()->getFlags()); 10509 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 10510 DAG.getConstant(8, DL, MVT::i64)); 10511 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, 10512 S->getPointerInfo(), S->getAlignment(), 10513 S->getMemOperand()->getFlags()); 10514 } 10515 10516 /// Target-specific DAG combine function for post-increment LD1 (lane) and 10517 /// post-increment LD1R. 10518 static SDValue performPostLD1Combine(SDNode *N, 10519 TargetLowering::DAGCombinerInfo &DCI, 10520 bool IsLaneOp) { 10521 if (DCI.isBeforeLegalizeOps()) 10522 return SDValue(); 10523 10524 SelectionDAG &DAG = DCI.DAG; 10525 EVT VT = N->getValueType(0); 10526 10527 unsigned LoadIdx = IsLaneOp ? 1 : 0; 10528 SDNode *LD = N->getOperand(LoadIdx).getNode(); 10529 // If it is not LOAD, can not do such combine. 10530 if (LD->getOpcode() != ISD::LOAD) 10531 return SDValue(); 10532 10533 // The vector lane must be a constant in the LD1LANE opcode. 10534 SDValue Lane; 10535 if (IsLaneOp) { 10536 Lane = N->getOperand(2); 10537 auto *LaneC = dyn_cast<ConstantSDNode>(Lane); 10538 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements()) 10539 return SDValue(); 10540 } 10541 10542 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD); 10543 EVT MemVT = LoadSDN->getMemoryVT(); 10544 // Check if memory operand is the same type as the vector element. 10545 if (MemVT != VT.getVectorElementType()) 10546 return SDValue(); 10547 10548 // Check if there are other uses. If so, do not combine as it will introduce 10549 // an extra load. 10550 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; 10551 ++UI) { 10552 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. 10553 continue; 10554 if (*UI != N) 10555 return SDValue(); 10556 } 10557 10558 SDValue Addr = LD->getOperand(1); 10559 SDValue Vector = N->getOperand(0); 10560 // Search for a use of the address operand that is an increment. 10561 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = 10562 Addr.getNode()->use_end(); UI != UE; ++UI) { 10563 SDNode *User = *UI; 10564 if (User->getOpcode() != ISD::ADD 10565 || UI.getUse().getResNo() != Addr.getResNo()) 10566 continue; 10567 10568 // If the increment is a constant, it must match the memory ref size. 10569 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 10570 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 10571 uint32_t IncVal = CInc->getZExtValue(); 10572 unsigned NumBytes = VT.getScalarSizeInBits() / 8; 10573 if (IncVal != NumBytes) 10574 continue; 10575 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 10576 } 10577 10578 // To avoid cycle construction make sure that neither the load nor the add 10579 // are predecessors to each other or the Vector. 
10580 SmallPtrSet<const SDNode *, 32> Visited; 10581 SmallVector<const SDNode *, 16> Worklist; 10582 Visited.insert(Addr.getNode()); 10583 Worklist.push_back(User); 10584 Worklist.push_back(LD); 10585 Worklist.push_back(Vector.getNode()); 10586 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) || 10587 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 10588 continue; 10589 10590 SmallVector<SDValue, 8> Ops; 10591 Ops.push_back(LD->getOperand(0)); // Chain 10592 if (IsLaneOp) { 10593 Ops.push_back(Vector); // The vector to be inserted 10594 Ops.push_back(Lane); // The lane to be inserted in the vector 10595 } 10596 Ops.push_back(Addr); 10597 Ops.push_back(Inc); 10598 10599 EVT Tys[3] = { VT, MVT::i64, MVT::Other }; 10600 SDVTList SDTys = DAG.getVTList(Tys); 10601 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; 10602 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, 10603 MemVT, 10604 LoadSDN->getMemOperand()); 10605 10606 // Update the uses. 10607 SDValue NewResults[] = { 10608 SDValue(LD, 0), // The result of load 10609 SDValue(UpdN.getNode(), 2) // Chain 10610 }; 10611 DCI.CombineTo(LD, NewResults); 10612 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result 10613 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register 10614 10615 break; 10616 } 10617 return SDValue(); 10618 } 10619 10620 /// Simplify ``Addr`` given that the top byte of it is ignored by HW during 10621 /// address translation. 10622 static bool performTBISimplification(SDValue Addr, 10623 TargetLowering::DAGCombinerInfo &DCI, 10624 SelectionDAG &DAG) { 10625 APInt DemandedMask = APInt::getLowBitsSet(64, 56); 10626 KnownBits Known; 10627 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 10628 !DCI.isBeforeLegalizeOps()); 10629 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10630 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) { 10631 DCI.CommitTargetLoweringOpt(TLO); 10632 return true; 10633 } 10634 return false; 10635 } 10636 10637 static SDValue performSTORECombine(SDNode *N, 10638 TargetLowering::DAGCombinerInfo &DCI, 10639 SelectionDAG &DAG, 10640 const AArch64Subtarget *Subtarget) { 10641 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget)) 10642 return Split; 10643 10644 if (Subtarget->supportsAddressTopByteIgnored() && 10645 performTBISimplification(N->getOperand(2), DCI, DAG)) 10646 return SDValue(N, 0); 10647 10648 return SDValue(); 10649 } 10650 10651 10652 /// Target-specific DAG combine function for NEON load/store intrinsics 10653 /// to merge base address updates. 10654 static SDValue performNEONPostLDSTCombine(SDNode *N, 10655 TargetLowering::DAGCombinerInfo &DCI, 10656 SelectionDAG &DAG) { 10657 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 10658 return SDValue(); 10659 10660 unsigned AddrOpIdx = N->getNumOperands() - 1; 10661 SDValue Addr = N->getOperand(AddrOpIdx); 10662 10663 // Search for a use of the address operand that is an increment. 10664 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 10665 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 10666 SDNode *User = *UI; 10667 if (User->getOpcode() != ISD::ADD || 10668 UI.getUse().getResNo() != Addr.getResNo()) 10669 continue; 10670 10671 // Check that the add is independent of the load/store. Otherwise, folding 10672 // it would create a cycle. 
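    // Illustrative example of the rewrite performed below (a sketch, not from
    // the original comments): for a 128-bit ld2,
    //   (aarch64_neon_ld2 addr), (add addr, #32)
    // is replaced by a single AArch64ISD::LD2post node producing the two
    // result vectors plus the post-incremented base address.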
10673 SmallPtrSet<const SDNode *, 32> Visited; 10674 SmallVector<const SDNode *, 16> Worklist; 10675 Visited.insert(Addr.getNode()); 10676 Worklist.push_back(N); 10677 Worklist.push_back(User); 10678 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 10679 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 10680 continue; 10681 10682 // Find the new opcode for the updating load/store. 10683 bool IsStore = false; 10684 bool IsLaneOp = false; 10685 bool IsDupOp = false; 10686 unsigned NewOpc = 0; 10687 unsigned NumVecs = 0; 10688 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 10689 switch (IntNo) { 10690 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 10691 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; 10692 NumVecs = 2; break; 10693 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; 10694 NumVecs = 3; break; 10695 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; 10696 NumVecs = 4; break; 10697 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; 10698 NumVecs = 2; IsStore = true; break; 10699 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; 10700 NumVecs = 3; IsStore = true; break; 10701 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; 10702 NumVecs = 4; IsStore = true; break; 10703 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; 10704 NumVecs = 2; break; 10705 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; 10706 NumVecs = 3; break; 10707 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; 10708 NumVecs = 4; break; 10709 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; 10710 NumVecs = 2; IsStore = true; break; 10711 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; 10712 NumVecs = 3; IsStore = true; break; 10713 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; 10714 NumVecs = 4; IsStore = true; break; 10715 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; 10716 NumVecs = 2; IsDupOp = true; break; 10717 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; 10718 NumVecs = 3; IsDupOp = true; break; 10719 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; 10720 NumVecs = 4; IsDupOp = true; break; 10721 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; 10722 NumVecs = 2; IsLaneOp = true; break; 10723 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; 10724 NumVecs = 3; IsLaneOp = true; break; 10725 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; 10726 NumVecs = 4; IsLaneOp = true; break; 10727 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; 10728 NumVecs = 2; IsStore = true; IsLaneOp = true; break; 10729 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; 10730 NumVecs = 3; IsStore = true; IsLaneOp = true; break; 10731 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; 10732 NumVecs = 4; IsStore = true; IsLaneOp = true; break; 10733 } 10734 10735 EVT VecTy; 10736 if (IsStore) 10737 VecTy = N->getOperand(2).getValueType(); 10738 else 10739 VecTy = N->getValueType(0); 10740 10741 // If the increment is a constant, it must match the memory ref size. 10742 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 
1 : 0); 10743 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 10744 uint32_t IncVal = CInc->getZExtValue(); 10745 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 10746 if (IsLaneOp || IsDupOp) 10747 NumBytes /= VecTy.getVectorNumElements(); 10748 if (IncVal != NumBytes) 10749 continue; 10750 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 10751 } 10752 SmallVector<SDValue, 8> Ops; 10753 Ops.push_back(N->getOperand(0)); // Incoming chain 10754 // Load lane and store have vector list as input. 10755 if (IsLaneOp || IsStore) 10756 for (unsigned i = 2; i < AddrOpIdx; ++i) 10757 Ops.push_back(N->getOperand(i)); 10758 Ops.push_back(Addr); // Base register 10759 Ops.push_back(Inc); 10760 10761 // Return Types. 10762 EVT Tys[6]; 10763 unsigned NumResultVecs = (IsStore ? 0 : NumVecs); 10764 unsigned n; 10765 for (n = 0; n < NumResultVecs; ++n) 10766 Tys[n] = VecTy; 10767 Tys[n++] = MVT::i64; // Type of write back register 10768 Tys[n] = MVT::Other; // Type of the chain 10769 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); 10770 10771 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); 10772 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, 10773 MemInt->getMemoryVT(), 10774 MemInt->getMemOperand()); 10775 10776 // Update the uses. 10777 std::vector<SDValue> NewResults; 10778 for (unsigned i = 0; i < NumResultVecs; ++i) { 10779 NewResults.push_back(SDValue(UpdN.getNode(), i)); 10780 } 10781 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); 10782 DCI.CombineTo(N, NewResults); 10783 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 10784 10785 break; 10786 } 10787 return SDValue(); 10788 } 10789 10790 // Checks to see if the value is the prescribed width and returns information 10791 // about its extension mode. 10792 static 10793 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { 10794 ExtType = ISD::NON_EXTLOAD; 10795 switch(V.getNode()->getOpcode()) { 10796 default: 10797 return false; 10798 case ISD::LOAD: { 10799 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode()); 10800 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) 10801 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { 10802 ExtType = LoadNode->getExtensionType(); 10803 return true; 10804 } 10805 return false; 10806 } 10807 case ISD::AssertSext: { 10808 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); 10809 if ((TypeNode->getVT() == MVT::i8 && width == 8) 10810 || (TypeNode->getVT() == MVT::i16 && width == 16)) { 10811 ExtType = ISD::SEXTLOAD; 10812 return true; 10813 } 10814 return false; 10815 } 10816 case ISD::AssertZext: { 10817 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); 10818 if ((TypeNode->getVT() == MVT::i8 && width == 8) 10819 || (TypeNode->getVT() == MVT::i16 && width == 16)) { 10820 ExtType = ISD::ZEXTLOAD; 10821 return true; 10822 } 10823 return false; 10824 } 10825 case ISD::Constant: 10826 case ISD::TargetConstant: { 10827 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < 10828 1LL << (width - 1); 10829 } 10830 } 10831 10832 return true; 10833 } 10834 10835 // This function does a whole lot of voodoo to determine if the tests are 10836 // equivalent without and with a mask. 
// Essentially what happens is that given a DAG resembling:
//
//  +-------------+ +-------------+ +-------------+ +-------------+
//  |    Input    | | AddConstant | | CompConstant| |     CC      |
//  +-------------+ +-------------+ +-------------+ +-------------+
//         |               |               |               |
//         V               V               |    +----------+
//  +-------------+     +----+             |    |
//  |     ADD     |     |0xff|             |    |
//  +-------------+     +----+             |    |
//         |               |               |    |
//         V               V               |    |
//          +-------------+                |    |
//          |     AND     |                |    |
//          +-------------+                |    |
//                 |                       |    |
//                 +-----+                 |    |
//                       |                 |    |
//                       V                 V    V
//                      +-------------+
//                      |     CMP     |
//                      +-------------+
//
// The AND node may be safely removed for some combinations of inputs. In
// particular we need to take into account the extension type of the Input,
// the exact values of AddConstant, CompConstant, and CC, along with the
// nominal width of the input (this can work for any width of input; the above
// graph is specific to 8 bits).
//
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
// problem was simplified by working with 4 bit inputs, which means we only
// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
// extension (8,15), 8 patterns unique to sign extension (-8,-1), and 8
// patterns present in both extensions (0,7). For every distinct pair of
// AddConstant and CompConstant bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result of this function is true
// for all 16 distinct bit patterns of the current extension type of Input (w0).
//
//   sub  w8, w0, w1
//   and  w10, w8, #0x0f
//   cmp  w8, w2
//   cset w9, AArch64CC
//   cmp  w10, w2
//   cset w11, AArch64CC
//   cmp  w9, w11
//   cset w0, eq
//   ret
//
// Since the above function shows when the outputs are equivalent, it defines
// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
// would be expensive to run during compiles. The equations below were written
// in a test harness that confirmed they give outputs equivalent to the above
// function for all inputs, so they can be used to determine whether the
// removal is legal instead.
//
// isEquivalentMaskless() is the test for whether the AND can be removed,
// factored out of the DAG recognition because the DAG can take several forms.

static bool isEquivalentMaskless(unsigned CC, unsigned width,
                                 ISD::LoadExtType ExtType, int AddConstant,
                                 int CompConstant) {
  // By being careful about our equations and only writing them in terms of
  // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
  // make them generally applicable to all bit widths.
  int MaxUInt = (1 << width);

  // For the purposes of these comparisons sign extending the type is
  // equivalent to zero extending the add and displacing it by half the integer
  // width. Provided we are careful and make sure our equations are valid over
  // the whole range we can just adjust the input and avoid writing equations
  // for sign extended inputs.
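  // Worked example (illustrative): for width == 8 and a sign-extending input,
  // the adjustment below subtracts 1 << 7 == 128 from AddConstant, i.e. it
  // applies exactly the half-width displacement described above so the
  // unsigned-domain equations in the switch remain valid.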
10909 if (ExtType == ISD::SEXTLOAD) 10910 AddConstant -= (1 << (width-1)); 10911 10912 switch(CC) { 10913 case AArch64CC::LE: 10914 case AArch64CC::GT: 10915 if ((AddConstant == 0) || 10916 (CompConstant == MaxUInt - 1 && AddConstant < 0) || 10917 (AddConstant >= 0 && CompConstant < 0) || 10918 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) 10919 return true; 10920 break; 10921 case AArch64CC::LT: 10922 case AArch64CC::GE: 10923 if ((AddConstant == 0) || 10924 (AddConstant >= 0 && CompConstant <= 0) || 10925 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) 10926 return true; 10927 break; 10928 case AArch64CC::HI: 10929 case AArch64CC::LS: 10930 if ((AddConstant >= 0 && CompConstant < 0) || 10931 (AddConstant <= 0 && CompConstant >= -1 && 10932 CompConstant < AddConstant + MaxUInt)) 10933 return true; 10934 break; 10935 case AArch64CC::PL: 10936 case AArch64CC::MI: 10937 if ((AddConstant == 0) || 10938 (AddConstant > 0 && CompConstant <= 0) || 10939 (AddConstant < 0 && CompConstant <= AddConstant)) 10940 return true; 10941 break; 10942 case AArch64CC::LO: 10943 case AArch64CC::HS: 10944 if ((AddConstant >= 0 && CompConstant <= 0) || 10945 (AddConstant <= 0 && CompConstant >= 0 && 10946 CompConstant <= AddConstant + MaxUInt)) 10947 return true; 10948 break; 10949 case AArch64CC::EQ: 10950 case AArch64CC::NE: 10951 if ((AddConstant > 0 && CompConstant < 0) || 10952 (AddConstant < 0 && CompConstant >= 0 && 10953 CompConstant < AddConstant + MaxUInt) || 10954 (AddConstant >= 0 && CompConstant >= 0 && 10955 CompConstant >= AddConstant) || 10956 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) 10957 return true; 10958 break; 10959 case AArch64CC::VS: 10960 case AArch64CC::VC: 10961 case AArch64CC::AL: 10962 case AArch64CC::NV: 10963 return true; 10964 case AArch64CC::Invalid: 10965 break; 10966 } 10967 10968 return false; 10969 } 10970 10971 static 10972 SDValue performCONDCombine(SDNode *N, 10973 TargetLowering::DAGCombinerInfo &DCI, 10974 SelectionDAG &DAG, unsigned CCIndex, 10975 unsigned CmpIndex) { 10976 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue(); 10977 SDNode *SubsNode = N->getOperand(CmpIndex).getNode(); 10978 unsigned CondOpcode = SubsNode->getOpcode(); 10979 10980 if (CondOpcode != AArch64ISD::SUBS) 10981 return SDValue(); 10982 10983 // There is a SUBS feeding this condition. Is it fed by a mask we can 10984 // use? 10985 10986 SDNode *AndNode = SubsNode->getOperand(0).getNode(); 10987 unsigned MaskBits = 0; 10988 10989 if (AndNode->getOpcode() != ISD::AND) 10990 return SDValue(); 10991 10992 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) { 10993 uint32_t CNV = CN->getZExtValue(); 10994 if (CNV == 255) 10995 MaskBits = 8; 10996 else if (CNV == 65535) 10997 MaskBits = 16; 10998 } 10999 11000 if (!MaskBits) 11001 return SDValue(); 11002 11003 SDValue AddValue = AndNode->getOperand(0); 11004 11005 if (AddValue.getOpcode() != ISD::ADD) 11006 return SDValue(); 11007 11008 // The basic dag structure is correct, grab the inputs and validate them. 11009 11010 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0); 11011 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1); 11012 SDValue SubsInputValue = SubsNode->getOperand(1); 11013 11014 // The mask is present and the provenance of all the values is a smaller type, 11015 // lets see if the mask is superfluous. 
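  // At this point the matched pattern looks like (a sketch, assuming an 8-bit
  // mask):
  //   SUBS (AND (ADD AddInputValue1, AddInputValue2), 0xff), SubsInputValue
  // The width checks below establish that every input already fits in
  // MaskBits bits, which is what lets isEquivalentMaskless() reason about
  // dropping the AND.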
11016 11017 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) || 11018 !isa<ConstantSDNode>(SubsInputValue.getNode())) 11019 return SDValue(); 11020 11021 ISD::LoadExtType ExtType; 11022 11023 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) || 11024 !checkValueWidth(AddInputValue2, MaskBits, ExtType) || 11025 !checkValueWidth(AddInputValue1, MaskBits, ExtType) ) 11026 return SDValue(); 11027 11028 if(!isEquivalentMaskless(CC, MaskBits, ExtType, 11029 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(), 11030 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue())) 11031 return SDValue(); 11032 11033 // The AND is not necessary, remove it. 11034 11035 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0), 11036 SubsNode->getValueType(1)); 11037 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) }; 11038 11039 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops); 11040 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode()); 11041 11042 return SDValue(N, 0); 11043 } 11044 11045 // Optimize compare with zero and branch. 11046 static SDValue performBRCONDCombine(SDNode *N, 11047 TargetLowering::DAGCombinerInfo &DCI, 11048 SelectionDAG &DAG) { 11049 MachineFunction &MF = DAG.getMachineFunction(); 11050 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions 11051 // will not be produced, as they are conditional branch instructions that do 11052 // not set flags. 11053 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening)) 11054 return SDValue(); 11055 11056 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3)) 11057 N = NV.getNode(); 11058 SDValue Chain = N->getOperand(0); 11059 SDValue Dest = N->getOperand(1); 11060 SDValue CCVal = N->getOperand(2); 11061 SDValue Cmp = N->getOperand(3); 11062 11063 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!"); 11064 unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue(); 11065 if (CC != AArch64CC::EQ && CC != AArch64CC::NE) 11066 return SDValue(); 11067 11068 unsigned CmpOpc = Cmp.getOpcode(); 11069 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) 11070 return SDValue(); 11071 11072 // Only attempt folding if there is only one use of the flag and no use of the 11073 // value. 11074 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) 11075 return SDValue(); 11076 11077 SDValue LHS = Cmp.getOperand(0); 11078 SDValue RHS = Cmp.getOperand(1); 11079 11080 assert(LHS.getValueType() == RHS.getValueType() && 11081 "Expected the value type to be the same for both operands!"); 11082 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 11083 return SDValue(); 11084 11085 if (isNullConstant(LHS)) 11086 std::swap(LHS, RHS); 11087 11088 if (!isNullConstant(RHS)) 11089 return SDValue(); 11090 11091 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || 11092 LHS.getOpcode() == ISD::SRL) 11093 return SDValue(); 11094 11095 // Fold the compare into the branch instruction. 11096 SDValue BR; 11097 if (CC == AArch64CC::EQ) 11098 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); 11099 else 11100 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); 11101 11102 // Do not add new nodes to DAG combiner worklist. 11103 DCI.CombineTo(N, BR, false); 11104 11105 return SDValue(); 11106 } 11107 11108 // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test 11109 // as well as whether the test should be inverted. 
This code is required to 11110 // catch these cases (as opposed to standard dag combines) because 11111 // AArch64ISD::TBZ is matched during legalization. 11112 static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, 11113 SelectionDAG &DAG) { 11114 11115 if (!Op->hasOneUse()) 11116 return Op; 11117 11118 // We don't handle undef/constant-fold cases below, as they should have 11119 // already been taken care of (e.g. and of 0, test of undefined shifted bits, 11120 // etc.) 11121 11122 // (tbz (trunc x), b) -> (tbz x, b) 11123 // This case is just here to enable more of the below cases to be caught. 11124 if (Op->getOpcode() == ISD::TRUNCATE && 11125 Bit < Op->getValueType(0).getSizeInBits()) { 11126 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 11127 } 11128 11129 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits. 11130 if (Op->getOpcode() == ISD::ANY_EXTEND && 11131 Bit < Op->getOperand(0).getValueSizeInBits()) { 11132 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 11133 } 11134 11135 if (Op->getNumOperands() != 2) 11136 return Op; 11137 11138 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 11139 if (!C) 11140 return Op; 11141 11142 switch (Op->getOpcode()) { 11143 default: 11144 return Op; 11145 11146 // (tbz (and x, m), b) -> (tbz x, b) 11147 case ISD::AND: 11148 if ((C->getZExtValue() >> Bit) & 1) 11149 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 11150 return Op; 11151 11152 // (tbz (shl x, c), b) -> (tbz x, b-c) 11153 case ISD::SHL: 11154 if (C->getZExtValue() <= Bit && 11155 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { 11156 Bit = Bit - C->getZExtValue(); 11157 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 11158 } 11159 return Op; 11160 11161 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x 11162 case ISD::SRA: 11163 Bit = Bit + C->getZExtValue(); 11164 if (Bit >= Op->getValueType(0).getSizeInBits()) 11165 Bit = Op->getValueType(0).getSizeInBits() - 1; 11166 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 11167 11168 // (tbz (srl x, c), b) -> (tbz x, b+c) 11169 case ISD::SRL: 11170 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { 11171 Bit = Bit + C->getZExtValue(); 11172 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 11173 } 11174 return Op; 11175 11176 // (tbz (xor x, -1), b) -> (tbnz x, b) 11177 case ISD::XOR: 11178 if ((C->getZExtValue() >> Bit) & 1) 11179 Invert = !Invert; 11180 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); 11181 } 11182 } 11183 11184 // Optimize test single bit zero/non-zero and branch. 
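// Illustrative examples of the rewrites getTestBitOperand enables (a sketch,
// not from the original comments):
//   tbz (and x, 0x10), #4, bb   -->  tbz  x, #4, bb   (mask keeps the bit)
//   tbz (shl x, 2), #5, bb      -->  tbz  x, #3, bb   (bit index shifts down)
//   tbz (xor x, -1), #3, bb     -->  tbnz x, #3, bb   (inversion flips opcode)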
11185 static SDValue performTBZCombine(SDNode *N, 11186 TargetLowering::DAGCombinerInfo &DCI, 11187 SelectionDAG &DAG) { 11188 unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 11189 bool Invert = false; 11190 SDValue TestSrc = N->getOperand(1); 11191 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG); 11192 11193 if (TestSrc == NewTestSrc) 11194 return SDValue(); 11195 11196 unsigned NewOpc = N->getOpcode(); 11197 if (Invert) { 11198 if (NewOpc == AArch64ISD::TBZ) 11199 NewOpc = AArch64ISD::TBNZ; 11200 else { 11201 assert(NewOpc == AArch64ISD::TBNZ); 11202 NewOpc = AArch64ISD::TBZ; 11203 } 11204 } 11205 11206 SDLoc DL(N); 11207 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc, 11208 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); 11209 } 11210 11211 // vselect (v1i1 setcc) -> 11212 // vselect (v1iXX setcc) (XX is the size of the compared operand type) 11213 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as 11214 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine 11215 // such VSELECT. 11216 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { 11217 SDValue N0 = N->getOperand(0); 11218 EVT CCVT = N0.getValueType(); 11219 11220 if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || 11221 CCVT.getVectorElementType() != MVT::i1) 11222 return SDValue(); 11223 11224 EVT ResVT = N->getValueType(0); 11225 EVT CmpVT = N0.getOperand(0).getValueType(); 11226 // Only combine when the result type is of the same size as the compared 11227 // operands. 11228 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) 11229 return SDValue(); 11230 11231 SDValue IfTrue = N->getOperand(1); 11232 SDValue IfFalse = N->getOperand(2); 11233 SDValue SetCC = 11234 DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), 11235 N0.getOperand(0), N0.getOperand(1), 11236 cast<CondCodeSDNode>(N0.getOperand(2))->get()); 11237 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, 11238 IfTrue, IfFalse); 11239 } 11240 11241 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with 11242 /// the compare-mask instructions rather than going via NZCV, even if LHS and 11243 /// RHS are really scalar. This replaces any scalar setcc in the above pattern 11244 /// with a vector one followed by a DUP shuffle on the result. 11245 static SDValue performSelectCombine(SDNode *N, 11246 TargetLowering::DAGCombinerInfo &DCI) { 11247 SelectionDAG &DAG = DCI.DAG; 11248 SDValue N0 = N->getOperand(0); 11249 EVT ResVT = N->getValueType(0); 11250 11251 if (N0.getOpcode() != ISD::SETCC) 11252 return SDValue(); 11253 11254 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered 11255 // scalar SetCCResultType. We also don't expect vectors, because we assume 11256 // that selects fed by vector SETCCs are canonicalized to VSELECT. 11257 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) && 11258 "Scalar-SETCC feeding SELECT has unexpected result type!"); 11259 11260 // If NumMaskElts == 0, the comparison is larger than select result. The 11261 // largest real NEON comparison is 64-bits per lane, which means the result is 11262 // at most 32-bits and an illegal vector. Just bail out for now. 11263 EVT SrcVT = N0.getOperand(0).getValueType(); 11264 11265 // Don't try to do this optimization when the setcc itself has i1 operands. 11266 // There are no legal vectors of i1, so this would be pointless. 
11267 if (SrcVT == MVT::i1) 11268 return SDValue(); 11269 11270 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); 11271 if (!ResVT.isVector() || NumMaskElts == 0) 11272 return SDValue(); 11273 11274 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); 11275 EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); 11276 11277 // Also bail out if the vector CCVT isn't the same size as ResVT. 11278 // This can happen if the SETCC operand size doesn't divide the ResVT size 11279 // (e.g., f64 vs v3f32). 11280 if (CCVT.getSizeInBits() != ResVT.getSizeInBits()) 11281 return SDValue(); 11282 11283 // Make sure we didn't create illegal types, if we're not supposed to. 11284 assert(DCI.isBeforeLegalize() || 11285 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)); 11286 11287 // First perform a vector comparison, where lane 0 is the one we're interested 11288 // in. 11289 SDLoc DL(N0); 11290 SDValue LHS = 11291 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); 11292 SDValue RHS = 11293 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); 11294 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); 11295 11296 // Now duplicate the comparison mask we want across all other lanes. 11297 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); 11298 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask); 11299 Mask = DAG.getNode(ISD::BITCAST, DL, 11300 ResVT.changeVectorElementTypeToInteger(), Mask); 11301 11302 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); 11303 } 11304 11305 /// Get rid of unnecessary NVCASTs (that don't change the type). 11306 static SDValue performNVCASTCombine(SDNode *N) { 11307 if (N->getValueType(0) == N->getOperand(0).getValueType()) 11308 return N->getOperand(0); 11309 11310 return SDValue(); 11311 } 11312 11313 // If all users of the globaladdr are of the form (globaladdr + constant), find 11314 // the smallest constant, fold it into the globaladdr's offset and rewrite the 11315 // globaladdr as (globaladdr + constant) - constant. 11316 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, 11317 const AArch64Subtarget *Subtarget, 11318 const TargetMachine &TM) { 11319 auto *GN = cast<GlobalAddressSDNode>(N); 11320 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) != 11321 AArch64II::MO_NO_FLAG) 11322 return SDValue(); 11323 11324 uint64_t MinOffset = -1ull; 11325 for (SDNode *N : GN->uses()) { 11326 if (N->getOpcode() != ISD::ADD) 11327 return SDValue(); 11328 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0)); 11329 if (!C) 11330 C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 11331 if (!C) 11332 return SDValue(); 11333 MinOffset = std::min(MinOffset, C->getZExtValue()); 11334 } 11335 uint64_t Offset = MinOffset + GN->getOffset(); 11336 11337 // Require that the new offset is larger than the existing one. Otherwise, we 11338 // can end up oscillating between two possible DAGs, for example, 11339 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1). 11340 if (Offset <= uint64_t(GN->getOffset())) 11341 return SDValue(); 11342 11343 // Check whether folding this offset is legal. It must not go out of bounds of 11344 // the referenced object to avoid violating the code model, and must be 11345 // smaller than 2^21 because this is the largest offset expressible in all 11346 // object formats. 
11347 // 11348 // This check also prevents us from folding negative offsets, which will end 11349 // up being treated in the same way as large positive ones. They could also 11350 // cause code model violations, and aren't really common enough to matter. 11351 if (Offset >= (1 << 21)) 11352 return SDValue(); 11353 11354 const GlobalValue *GV = GN->getGlobal(); 11355 Type *T = GV->getValueType(); 11356 if (!T->isSized() || 11357 Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T)) 11358 return SDValue(); 11359 11360 SDLoc DL(GN); 11361 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset); 11362 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result, 11363 DAG.getConstant(MinOffset, DL, MVT::i64)); 11364 } 11365 11366 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, 11367 DAGCombinerInfo &DCI) const { 11368 SelectionDAG &DAG = DCI.DAG; 11369 switch (N->getOpcode()) { 11370 default: 11371 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n"); 11372 break; 11373 case ISD::ADD: 11374 case ISD::SUB: 11375 return performAddSubLongCombine(N, DCI, DAG); 11376 case ISD::XOR: 11377 return performXorCombine(N, DAG, DCI, Subtarget); 11378 case ISD::MUL: 11379 return performMulCombine(N, DAG, DCI, Subtarget); 11380 case ISD::SINT_TO_FP: 11381 case ISD::UINT_TO_FP: 11382 return performIntToFpCombine(N, DAG, Subtarget); 11383 case ISD::FP_TO_SINT: 11384 case ISD::FP_TO_UINT: 11385 return performFpToIntCombine(N, DAG, DCI, Subtarget); 11386 case ISD::FDIV: 11387 return performFDivCombine(N, DAG, DCI, Subtarget); 11388 case ISD::OR: 11389 return performORCombine(N, DCI, Subtarget); 11390 case ISD::AND: 11391 return performANDCombine(N, DCI); 11392 case ISD::SRL: 11393 return performSRLCombine(N, DCI); 11394 case ISD::INTRINSIC_WO_CHAIN: 11395 return performIntrinsicCombine(N, DCI, Subtarget); 11396 case ISD::ANY_EXTEND: 11397 case ISD::ZERO_EXTEND: 11398 case ISD::SIGN_EXTEND: 11399 return performExtendCombine(N, DCI, DAG); 11400 case ISD::BITCAST: 11401 return performBitcastCombine(N, DCI, DAG); 11402 case ISD::CONCAT_VECTORS: 11403 return performConcatVectorsCombine(N, DCI, DAG); 11404 case ISD::SELECT: 11405 return performSelectCombine(N, DCI); 11406 case ISD::VSELECT: 11407 return performVSelectCombine(N, DCI.DAG); 11408 case ISD::LOAD: 11409 if (performTBISimplification(N->getOperand(1), DCI, DAG)) 11410 return SDValue(N, 0); 11411 break; 11412 case ISD::STORE: 11413 return performSTORECombine(N, DCI, DAG, Subtarget); 11414 case AArch64ISD::BRCOND: 11415 return performBRCONDCombine(N, DCI, DAG); 11416 case AArch64ISD::TBNZ: 11417 case AArch64ISD::TBZ: 11418 return performTBZCombine(N, DCI, DAG); 11419 case AArch64ISD::CSEL: 11420 return performCONDCombine(N, DCI, DAG, 2, 3); 11421 case AArch64ISD::DUP: 11422 return performPostLD1Combine(N, DCI, false); 11423 case AArch64ISD::NVCAST: 11424 return performNVCASTCombine(N); 11425 case ISD::INSERT_VECTOR_ELT: 11426 return performPostLD1Combine(N, DCI, true); 11427 case ISD::INTRINSIC_VOID: 11428 case ISD::INTRINSIC_W_CHAIN: 11429 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 11430 case Intrinsic::aarch64_neon_ld2: 11431 case Intrinsic::aarch64_neon_ld3: 11432 case Intrinsic::aarch64_neon_ld4: 11433 case Intrinsic::aarch64_neon_ld1x2: 11434 case Intrinsic::aarch64_neon_ld1x3: 11435 case Intrinsic::aarch64_neon_ld1x4: 11436 case Intrinsic::aarch64_neon_ld2lane: 11437 case Intrinsic::aarch64_neon_ld3lane: 11438 case Intrinsic::aarch64_neon_ld4lane: 11439 case Intrinsic::aarch64_neon_ld2r: 11440 case 
Intrinsic::aarch64_neon_ld3r: 11441 case Intrinsic::aarch64_neon_ld4r: 11442 case Intrinsic::aarch64_neon_st2: 11443 case Intrinsic::aarch64_neon_st3: 11444 case Intrinsic::aarch64_neon_st4: 11445 case Intrinsic::aarch64_neon_st1x2: 11446 case Intrinsic::aarch64_neon_st1x3: 11447 case Intrinsic::aarch64_neon_st1x4: 11448 case Intrinsic::aarch64_neon_st2lane: 11449 case Intrinsic::aarch64_neon_st3lane: 11450 case Intrinsic::aarch64_neon_st4lane: 11451 return performNEONPostLDSTCombine(N, DCI, DAG); 11452 default: 11453 break; 11454 } 11455 break; 11456 case ISD::GlobalAddress: 11457 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine()); 11458 } 11459 return SDValue(); 11460 } 11461 11462 // Check if the return value is used as only a return value, as otherwise 11463 // we can't perform a tail-call. In particular, we need to check for 11464 // target ISD nodes that are returns and any other "odd" constructs 11465 // that the generic analysis code won't necessarily catch. 11466 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, 11467 SDValue &Chain) const { 11468 if (N->getNumValues() != 1) 11469 return false; 11470 if (!N->hasNUsesOfValue(1, 0)) 11471 return false; 11472 11473 SDValue TCChain = Chain; 11474 SDNode *Copy = *N->use_begin(); 11475 if (Copy->getOpcode() == ISD::CopyToReg) { 11476 // If the copy has a glue operand, we conservatively assume it isn't safe to 11477 // perform a tail call. 11478 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == 11479 MVT::Glue) 11480 return false; 11481 TCChain = Copy->getOperand(0); 11482 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 11483 return false; 11484 11485 bool HasRet = false; 11486 for (SDNode *Node : Copy->uses()) { 11487 if (Node->getOpcode() != AArch64ISD::RET_FLAG) 11488 return false; 11489 HasRet = true; 11490 } 11491 11492 if (!HasRet) 11493 return false; 11494 11495 Chain = TCChain; 11496 return true; 11497 } 11498 11499 // Return whether the an instruction can potentially be optimized to a tail 11500 // call. This will cause the optimizers to attempt to move, or duplicate, 11501 // return instructions to help enable tail call optimizations for this 11502 // instruction. 11503 bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { 11504 return CI->isTailCall(); 11505 } 11506 11507 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, 11508 SDValue &Offset, 11509 ISD::MemIndexedMode &AM, 11510 bool &IsInc, 11511 SelectionDAG &DAG) const { 11512 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) 11513 return false; 11514 11515 Base = Op->getOperand(0); 11516 // All of the indexed addressing mode instructions take a signed 11517 // 9 bit immediate offset. 
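  // That is, offsets in the range [-256, 255]; the isInt<9>(RHSC) check below
  // enforces this, with a SUB's offset negated first so it is tested in the
  // same signed range.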
11518 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) { 11519 int64_t RHSC = RHS->getSExtValue(); 11520 if (Op->getOpcode() == ISD::SUB) 11521 RHSC = -(uint64_t)RHSC; 11522 if (!isInt<9>(RHSC)) 11523 return false; 11524 IsInc = (Op->getOpcode() == ISD::ADD); 11525 Offset = Op->getOperand(1); 11526 return true; 11527 } 11528 return false; 11529 } 11530 11531 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 11532 SDValue &Offset, 11533 ISD::MemIndexedMode &AM, 11534 SelectionDAG &DAG) const { 11535 EVT VT; 11536 SDValue Ptr; 11537 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 11538 VT = LD->getMemoryVT(); 11539 Ptr = LD->getBasePtr(); 11540 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 11541 VT = ST->getMemoryVT(); 11542 Ptr = ST->getBasePtr(); 11543 } else 11544 return false; 11545 11546 bool IsInc; 11547 if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) 11548 return false; 11549 AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; 11550 return true; 11551 } 11552 11553 bool AArch64TargetLowering::getPostIndexedAddressParts( 11554 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, 11555 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { 11556 EVT VT; 11557 SDValue Ptr; 11558 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 11559 VT = LD->getMemoryVT(); 11560 Ptr = LD->getBasePtr(); 11561 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 11562 VT = ST->getMemoryVT(); 11563 Ptr = ST->getBasePtr(); 11564 } else 11565 return false; 11566 11567 bool IsInc; 11568 if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) 11569 return false; 11570 // Post-indexing updates the base, so it's not a valid transform 11571 // if that's not the same as the load's pointer. 11572 if (Ptr != Base) 11573 return false; 11574 AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; 11575 return true; 11576 } 11577 11578 static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, 11579 SelectionDAG &DAG) { 11580 SDLoc DL(N); 11581 SDValue Op = N->getOperand(0); 11582 11583 if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16) 11584 return; 11585 11586 Op = SDValue( 11587 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, 11588 DAG.getUNDEF(MVT::i32), Op, 11589 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 11590 0); 11591 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op); 11592 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); 11593 } 11594 11595 static void ReplaceReductionResults(SDNode *N, 11596 SmallVectorImpl<SDValue> &Results, 11597 SelectionDAG &DAG, unsigned InterOp, 11598 unsigned AcrossOp) { 11599 EVT LoVT, HiVT; 11600 SDValue Lo, Hi; 11601 SDLoc dl(N); 11602 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); 11603 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); 11604 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi); 11605 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal); 11606 Results.push_back(SplitVal); 11607 } 11608 11609 static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) { 11610 SDLoc DL(N); 11611 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N); 11612 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, 11613 DAG.getNode(ISD::SRL, DL, MVT::i128, N, 11614 DAG.getConstant(64, DL, MVT::i64))); 11615 return std::make_pair(Lo, Hi); 11616 } 11617 11618 // Create an even/odd pair of X registers holding integer value V. 
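// A sketch of the result (illustrative): for an i128 value V this builds
//   REG_SEQUENCE XSeqPairsClass, lo64(V):sube64, hi64(V):subo64
// (halves swapped on big-endian targets), which is the register-pair form the
// CASP instructions emitted in ReplaceCMP_SWAP_128Results expect.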
11619 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { 11620 SDLoc dl(V.getNode()); 11621 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64); 11622 SDValue VHi = DAG.getAnyExtOrTrunc( 11623 DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)), 11624 dl, MVT::i64); 11625 if (DAG.getDataLayout().isBigEndian()) 11626 std::swap (VLo, VHi); 11627 SDValue RegClass = 11628 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32); 11629 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32); 11630 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32); 11631 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; 11632 return SDValue( 11633 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); 11634 } 11635 11636 static void ReplaceCMP_SWAP_128Results(SDNode *N, 11637 SmallVectorImpl<SDValue> &Results, 11638 SelectionDAG &DAG, 11639 const AArch64Subtarget *Subtarget) { 11640 assert(N->getValueType(0) == MVT::i128 && 11641 "AtomicCmpSwap on types less than 128 should be legal"); 11642 11643 if (Subtarget->hasLSE()) { 11644 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type, 11645 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG. 11646 SDValue Ops[] = { 11647 createGPRPairNode(DAG, N->getOperand(2)), // Compare value 11648 createGPRPairNode(DAG, N->getOperand(3)), // Store value 11649 N->getOperand(1), // Ptr 11650 N->getOperand(0), // Chain in 11651 }; 11652 11653 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); 11654 11655 unsigned Opcode; 11656 switch (MemOp->getOrdering()) { 11657 case AtomicOrdering::Monotonic: 11658 Opcode = AArch64::CASPX; 11659 break; 11660 case AtomicOrdering::Acquire: 11661 Opcode = AArch64::CASPAX; 11662 break; 11663 case AtomicOrdering::Release: 11664 Opcode = AArch64::CASPLX; 11665 break; 11666 case AtomicOrdering::AcquireRelease: 11667 case AtomicOrdering::SequentiallyConsistent: 11668 Opcode = AArch64::CASPALX; 11669 break; 11670 default: 11671 llvm_unreachable("Unexpected ordering!"); 11672 } 11673 11674 MachineSDNode *CmpSwap = DAG.getMachineNode( 11675 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops); 11676 DAG.setNodeMemRefs(CmpSwap, {MemOp}); 11677 11678 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64; 11679 if (DAG.getDataLayout().isBigEndian()) 11680 std::swap(SubReg1, SubReg2); 11681 Results.push_back(DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64, 11682 SDValue(CmpSwap, 0))); 11683 Results.push_back(DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64, 11684 SDValue(CmpSwap, 0))); 11685 Results.push_back(SDValue(CmpSwap, 1)); // Chain out 11686 return; 11687 } 11688 11689 auto Desired = splitInt128(N->getOperand(2), DAG); 11690 auto New = splitInt128(N->getOperand(3), DAG); 11691 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second, 11692 New.first, New.second, N->getOperand(0)}; 11693 SDNode *CmpSwap = DAG.getMachineNode( 11694 AArch64::CMP_SWAP_128, SDLoc(N), 11695 DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops); 11696 11697 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); 11698 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); 11699 11700 Results.push_back(SDValue(CmpSwap, 0)); 11701 Results.push_back(SDValue(CmpSwap, 1)); 11702 Results.push_back(SDValue(CmpSwap, 3)); 11703 } 11704 11705 void AArch64TargetLowering::ReplaceNodeResults( 11706 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) 
const { 11707 switch (N->getOpcode()) { 11708 default: 11709 llvm_unreachable("Don't know how to custom expand this"); 11710 case ISD::BITCAST: 11711 ReplaceBITCASTResults(N, Results, DAG); 11712 return; 11713 case ISD::VECREDUCE_ADD: 11714 case ISD::VECREDUCE_SMAX: 11715 case ISD::VECREDUCE_SMIN: 11716 case ISD::VECREDUCE_UMAX: 11717 case ISD::VECREDUCE_UMIN: 11718 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); 11719 return; 11720 11721 case AArch64ISD::SADDV: 11722 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); 11723 return; 11724 case AArch64ISD::UADDV: 11725 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV); 11726 return; 11727 case AArch64ISD::SMINV: 11728 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV); 11729 return; 11730 case AArch64ISD::UMINV: 11731 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV); 11732 return; 11733 case AArch64ISD::SMAXV: 11734 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV); 11735 return; 11736 case AArch64ISD::UMAXV: 11737 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV); 11738 return; 11739 case ISD::FP_TO_UINT: 11740 case ISD::FP_TO_SINT: 11741 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); 11742 // Let normal code take care of it by not adding anything to Results. 11743 return; 11744 case ISD::ATOMIC_CMP_SWAP: 11745 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget); 11746 return; 11747 } 11748 } 11749 11750 bool AArch64TargetLowering::useLoadStackGuardNode() const { 11751 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia()) 11752 return TargetLowering::useLoadStackGuardNode(); 11753 return true; 11754 } 11755 11756 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { 11757 // Combine multiple FDIVs with the same divisor into multiple FMULs by the 11758 // reciprocal if there are three or more FDIVs. 11759 return 3; 11760 } 11761 11762 TargetLoweringBase::LegalizeTypeAction 11763 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const { 11764 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8, 11765 // v4i16, v2i32 instead of to promote. 11766 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 || 11767 VT == MVT::v1f32) 11768 return TypeWidenVector; 11769 11770 return TargetLoweringBase::getPreferredVectorAction(VT); 11771 } 11772 11773 // Loads and stores less than 128-bits are already atomic; ones above that 11774 // are doomed anyway, so defer to the default libcall and blame the OS when 11775 // things go wrong. 11776 bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 11777 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); 11778 return Size == 128; 11779 } 11780 11781 // Loads and stores less than 128-bits are already atomic; ones above that 11782 // are doomed anyway, so defer to the default libcall and blame the OS when 11783 // things go wrong. 11784 TargetLowering::AtomicExpansionKind 11785 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 11786 unsigned Size = LI->getType()->getPrimitiveSizeInBits(); 11787 return Size == 128 ? 
AtomicExpansionKind::LLSC : AtomicExpansionKind::None; 11788 } 11789 11790 // For the real atomic operations, we have ldxr/stxr up to 128 bits, 11791 TargetLowering::AtomicExpansionKind 11792 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 11793 if (AI->isFloatingPointOperation()) 11794 return AtomicExpansionKind::CmpXChg; 11795 11796 unsigned Size = AI->getType()->getPrimitiveSizeInBits(); 11797 if (Size > 128) return AtomicExpansionKind::None; 11798 // Nand not supported in LSE. 11799 if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC; 11800 // Leave 128 bits to LLSC. 11801 return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC; 11802 } 11803 11804 TargetLowering::AtomicExpansionKind 11805 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( 11806 AtomicCmpXchgInst *AI) const { 11807 // If subtarget has LSE, leave cmpxchg intact for codegen. 11808 if (Subtarget->hasLSE()) 11809 return AtomicExpansionKind::None; 11810 // At -O0, fast-regalloc cannot cope with the live vregs necessary to 11811 // implement cmpxchg without spilling. If the address being exchanged is also 11812 // on the stack and close enough to the spill slot, this can lead to a 11813 // situation where the monitor always gets cleared and the atomic operation 11814 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. 11815 if (getTargetMachine().getOptLevel() == 0) 11816 return AtomicExpansionKind::None; 11817 return AtomicExpansionKind::LLSC; 11818 } 11819 11820 Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, 11821 AtomicOrdering Ord) const { 11822 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 11823 Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); 11824 bool IsAcquire = isAcquireOrStronger(Ord); 11825 11826 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd 11827 // intrinsic must return {i64, i64} and we have to recombine them into a 11828 // single i128 here. 11829 if (ValTy->getPrimitiveSizeInBits() == 128) { 11830 Intrinsic::ID Int = 11831 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; 11832 Function *Ldxr = Intrinsic::getDeclaration(M, Int); 11833 11834 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 11835 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); 11836 11837 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); 11838 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); 11839 Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); 11840 Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); 11841 return Builder.CreateOr( 11842 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); 11843 } 11844 11845 Type *Tys[] = { Addr->getType() }; 11846 Intrinsic::ID Int = 11847 IsAcquire ? 
Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; 11848 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys); 11849 11850 Type *EltTy = cast<PointerType>(Addr->getType())->getElementType(); 11851 11852 const DataLayout &DL = M->getDataLayout(); 11853 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(EltTy)); 11854 Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy); 11855 11856 return Builder.CreateBitCast(Trunc, EltTy); 11857 } 11858 11859 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( 11860 IRBuilder<> &Builder) const { 11861 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 11862 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); 11863 } 11864 11865 Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, 11866 Value *Val, Value *Addr, 11867 AtomicOrdering Ord) const { 11868 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 11869 bool IsRelease = isReleaseOrStronger(Ord); 11870 11871 // Since the intrinsics must have legal type, the i128 intrinsics take two 11872 // parameters: "i64, i64". We must marshal Val into the appropriate form 11873 // before the call. 11874 if (Val->getType()->getPrimitiveSizeInBits() == 128) { 11875 Intrinsic::ID Int = 11876 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; 11877 Function *Stxr = Intrinsic::getDeclaration(M, Int); 11878 Type *Int64Ty = Type::getInt64Ty(M->getContext()); 11879 11880 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); 11881 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); 11882 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 11883 return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); 11884 } 11885 11886 Intrinsic::ID Int = 11887 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; 11888 Type *Tys[] = { Addr->getType() }; 11889 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); 11890 11891 const DataLayout &DL = M->getDataLayout(); 11892 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType())); 11893 Val = Builder.CreateBitCast(Val, IntValTy); 11894 11895 return Builder.CreateCall(Stxr, 11896 {Builder.CreateZExtOrBitCast( 11897 Val, Stxr->getFunctionType()->getParamType(0)), 11898 Addr}); 11899 } 11900 11901 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( 11902 Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { 11903 return Ty->isArrayTy(); 11904 } 11905 11906 bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, 11907 EVT) const { 11908 return false; 11909 } 11910 11911 static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) { 11912 Module *M = IRB.GetInsertBlock()->getParent()->getParent(); 11913 Function *ThreadPointerFunc = 11914 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); 11915 return IRB.CreatePointerCast( 11916 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), 11917 Offset), 11918 IRB.getInt8PtrTy()->getPointerTo(0)); 11919 } 11920 11921 Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { 11922 // Android provides a fixed TLS slot for the stack cookie. See the definition 11923 // of TLS_SLOT_STACK_GUARD in 11924 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h 11925 if (Subtarget->isTargetAndroid()) 11926 return UseTlsOffset(IRB, 0x28); 11927 11928 // Fuchsia is similar. 11929 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value. 

bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
  return Ty->isArrayTy();
}

bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
                                                            EVT) const {
  return false;
}

static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
  Function *ThreadPointerFunc =
      Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
  return IRB.CreatePointerCast(
      IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
                             Offset),
      IRB.getInt8PtrTy()->getPointerTo(0));
}

Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
  // Android provides a fixed TLS slot for the stack cookie. See the definition
  // of TLS_SLOT_STACK_GUARD in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget->isTargetAndroid())
    return UseTlsOffset(IRB, 0x28);

  // Fuchsia is similar.
  // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
  if (Subtarget->isTargetFuchsia())
    return UseTlsOffset(IRB, -0x10);

  return TargetLowering::getIRStackGuard(IRB);
}

void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
  // The MSVC CRT provides functionality for stack protection.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
    // The MSVC CRT has a global variable holding the security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        Type::getInt8PtrTy(M.getContext()));

    // The MSVC CRT has a function to validate the security cookie.
    FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
        "__security_check_cookie", Type::getVoidTy(M.getContext()),
        Type::getInt8PtrTy(M.getContext()));
    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
      F->setCallingConv(CallingConv::Win64);
      F->addAttribute(1, Attribute::AttrKind::InReg);
    }
    return;
  }
  TargetLowering::insertSSPDeclarations(M);
}

Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
  // The MSVC CRT has a global variable holding the security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // The MSVC CRT has a function to validate the security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

Value *
AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget->isTargetAndroid())
    return UseTlsOffset(IRB, 0x48);

  // Fuchsia is similar.
  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
  if (Subtarget->isTargetFuchsia())
    return UseTlsOffset(IRB, -0x8);

  return TargetLowering::getSafeStackPointerLocation(IRB);
}
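
// Illustrative sketch only (register and value names are placeholders): on
// Android, UseTlsOffset(IRB, 0x28) in getIRStackGuard() above builds IR of
// roughly this shape, since TLS_SLOT_STACK_GUARD lives 0x28 bytes past the
// thread pointer:
//
//   %tp    = call i8* @llvm.thread.pointer()
//   %slot  = getelementptr i8, i8* %tp, i32 40      ; 40 == 0x28
//   %guard = bitcast i8* %slot to i8**
//
// so the eventual stack-guard load selects to something like:
//
//   mrs x8, TPIDR_EL0
//   ldr x8, [x8, #40]
//
// The SafeStack slot in getSafeStackPointerLocation() is located the same
// way, just at offset 0x48.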

bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  // Only sink the 'and' mask to the cmp use block if it is masking a single
  // bit, since this is likely to fold the and/cmp/br into a single tbz
  // instruction. It may be beneficial to sink in other cases, but we would
  // have to check that the cmp would not get folded into the br to form a cbz
  // for these to be beneficial.
  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
  if (!Mask)
    return false;
  return Mask->getValue().isPowerOf2();
}

bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
                                              SDNode *N) const {
  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
      !Subtarget->isTargetWindows())
    return false;
  return true;
}

void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in AArch64FunctionInfo.
  AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void AArch64TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (AArch64::GPR64RegClass.contains(*I))
      RC = &AArch64::GPR64RegClass;
    else if (AArch64::FPR64RegClass.contains(*I))
      RC = &AArch64::FPR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create a copy from the CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions. It works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on AArch64 is expensive. However, when aggressively
  // optimizing for code size, we prefer to use a div instruction, as it is
  // usually smaller than the alternative sequence.
  // The exception to this is vector division. Since AArch64 doesn't have
  // vector integer division, leaving the division as-is is a loss even in
  // terms of size, because it will have to be scalarized, while the
  // alternative code sequence can be performed in vector form.
  bool OptSize =
      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
  return OptSize && !VT.isVector();
}
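
// Illustrative example of the isIntDivCheap() trade-off (the exact expansion
// depends on the target and the constant; the sequences below are only a
// sketch): for a scalar "x / 3" in a minsize function, returning true keeps
// the short
//
//   mov  w8, #3
//   sdiv w0, w0, w8
//
// rather than the larger multiply-by-magic-constant/shift expansion. For a
// <4 x i32> divide we still return false, because a plain divide would have
// to be scalarized into four sdiv instructions, while the expanded sequence
// can stay in vector form.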

bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  // We want inc-of-add for scalars and sub-of-not for vectors.
  return VT.isScalarInteger();
}

bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
}

unsigned
AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
    return getPointerTy(DL).getSizeInBits();

  return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
}

void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}

// Unlike X86, we let frame lowering assign offsets to all catch objects.
bool AArch64TargetLowering::needsFixedCatchObjects() const {
  return false;
}
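
// Note on getVaListSizeInBits() above: on AAPCS64 (ELF) targets, va_list is
// the five-field structure sketched below (3 * 64 + 2 * 32 = 256 bits), while
// Darwin and Windows use a plain char * (pointer-sized). The field names
// follow the AAPCS64 document and are shown for illustration only:
//
//   struct va_list {
//     void *__stack;   // next stacked argument
//     void *__gr_top;  // end of the general-register save area
//     void *__vr_top;  // end of the FP/SIMD-register save area
//     int   __gr_offs; // offset to the next general-register argument
//     int   __vr_offs; // offset to the next FP/SIMD-register argument
//   };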