xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (revision bc5304a006238115291e7568583632889dffbab9)
1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation  ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ISelLowering.h"
14 #include "AArch64CallingConvention.h"
15 #include "AArch64ExpandImm.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "MCTargetDesc/AArch64AddressingModes.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/Statistic.h"
29 #include "llvm/ADT/StringRef.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
32 #include "llvm/Analysis/VectorUtils.h"
33 #include "llvm/CodeGen/CallingConvLower.h"
34 #include "llvm/CodeGen/MachineBasicBlock.h"
35 #include "llvm/CodeGen/MachineFrameInfo.h"
36 #include "llvm/CodeGen/MachineFunction.h"
37 #include "llvm/CodeGen/MachineInstr.h"
38 #include "llvm/CodeGen/MachineInstrBuilder.h"
39 #include "llvm/CodeGen/MachineMemOperand.h"
40 #include "llvm/CodeGen/MachineRegisterInfo.h"
41 #include "llvm/CodeGen/RuntimeLibcalls.h"
42 #include "llvm/CodeGen/SelectionDAG.h"
43 #include "llvm/CodeGen/SelectionDAGNodes.h"
44 #include "llvm/CodeGen/TargetCallingConv.h"
45 #include "llvm/CodeGen/TargetInstrInfo.h"
46 #include "llvm/CodeGen/ValueTypes.h"
47 #include "llvm/IR/Attributes.h"
48 #include "llvm/IR/Constants.h"
49 #include "llvm/IR/DataLayout.h"
50 #include "llvm/IR/DebugLoc.h"
51 #include "llvm/IR/DerivedTypes.h"
52 #include "llvm/IR/Function.h"
53 #include "llvm/IR/GetElementPtrTypeIterator.h"
54 #include "llvm/IR/GlobalValue.h"
55 #include "llvm/IR/IRBuilder.h"
56 #include "llvm/IR/Instruction.h"
57 #include "llvm/IR/Instructions.h"
58 #include "llvm/IR/IntrinsicInst.h"
59 #include "llvm/IR/Intrinsics.h"
60 #include "llvm/IR/IntrinsicsAArch64.h"
61 #include "llvm/IR/Module.h"
62 #include "llvm/IR/OperandTraits.h"
63 #include "llvm/IR/PatternMatch.h"
64 #include "llvm/IR/Type.h"
65 #include "llvm/IR/Use.h"
66 #include "llvm/IR/Value.h"
67 #include "llvm/MC/MCRegisterInfo.h"
68 #include "llvm/Support/Casting.h"
69 #include "llvm/Support/CodeGen.h"
70 #include "llvm/Support/CommandLine.h"
71 #include "llvm/Support/Compiler.h"
72 #include "llvm/Support/Debug.h"
73 #include "llvm/Support/ErrorHandling.h"
74 #include "llvm/Support/KnownBits.h"
75 #include "llvm/Support/MachineValueType.h"
76 #include "llvm/Support/MathExtras.h"
77 #include "llvm/Support/raw_ostream.h"
78 #include "llvm/Target/TargetMachine.h"
79 #include "llvm/Target/TargetOptions.h"
80 #include <algorithm>
81 #include <bitset>
82 #include <cassert>
83 #include <cctype>
84 #include <cstdint>
85 #include <cstdlib>
86 #include <iterator>
87 #include <limits>
88 #include <tuple>
89 #include <utility>
90 #include <vector>
91 
92 using namespace llvm;
93 using namespace llvm::PatternMatch;
94 
95 #define DEBUG_TYPE "aarch64-lower"
96 
97 STATISTIC(NumTailCalls, "Number of tail calls");
98 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
99 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
100 
101 // FIXME: The necessary dtprel relocations don't seem to be supported
102 // well in the GNU bfd and gold linkers at the moment. Therefore, by
103 // default, for now, fall back to GeneralDynamic code generation.
104 cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
105     "aarch64-elf-ldtls-generation", cl::Hidden,
106     cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
107     cl::init(false));
108 
109 static cl::opt<bool>
110 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
111                          cl::desc("Enable AArch64 logical imm instruction "
112                                   "optimization"),
113                          cl::init(true));
114 
115 // Temporary option added for the purpose of testing functionality added
116 // to DAGCombiner.cpp in D92230. It is expected that this can be removed
117 // in future when both implementations will be based off MGATHER rather
118 // than the GLD1 nodes added for the SVE gather load intrinsics.
119 static cl::opt<bool>
120 EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
121                                 cl::desc("Combine extends of AArch64 masked "
122                                          "gather intrinsics"),
123                                 cl::init(true));
124 
125 /// Value type used for condition codes.
126 static const MVT MVT_CC = MVT::i32;
127 
128 static inline EVT getPackedSVEVectorVT(EVT VT) {
129   switch (VT.getSimpleVT().SimpleTy) {
130   default:
131     llvm_unreachable("unexpected element type for vector");
132   case MVT::i8:
133     return MVT::nxv16i8;
134   case MVT::i16:
135     return MVT::nxv8i16;
136   case MVT::i32:
137     return MVT::nxv4i32;
138   case MVT::i64:
139     return MVT::nxv2i64;
140   case MVT::f16:
141     return MVT::nxv8f16;
142   case MVT::f32:
143     return MVT::nxv4f32;
144   case MVT::f64:
145     return MVT::nxv2f64;
146   case MVT::bf16:
147     return MVT::nxv8bf16;
148   }
149 }
150 
151 // NOTE: Currently there's only a need to return integer vector types. If this
152 // changes then just add an extra "type" parameter.
153 static inline EVT getPackedSVEVectorVT(ElementCount EC) {
154   switch (EC.getKnownMinValue()) {
155   default:
156     llvm_unreachable("unexpected element count for vector");
157   case 16:
158     return MVT::nxv16i8;
159   case 8:
160     return MVT::nxv8i16;
161   case 4:
162     return MVT::nxv4i32;
163   case 2:
164     return MVT::nxv2i64;
165   }
166 }
167 
168 static inline EVT getPromotedVTForPredicate(EVT VT) {
169   assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
170          "Expected scalable predicate vector type!");
171   switch (VT.getVectorMinNumElements()) {
172   default:
173     llvm_unreachable("unexpected element count for vector");
174   case 2:
175     return MVT::nxv2i64;
176   case 4:
177     return MVT::nxv4i32;
178   case 8:
179     return MVT::nxv8i16;
180   case 16:
181     return MVT::nxv16i8;
182   }
183 }
184 
185 /// Returns true if VT's elements occupy the lowest bit positions of its
186 /// associated register class without any intervening space.
187 ///
188 /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
189 /// same register class, but only nxv8f16 can be treated as a packed vector.
190 static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
191   assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
192          "Expected legal vector type!");
193   return VT.isFixedLengthVector() ||
194          VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
195 }
196 
197 // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
198 // predicate and end with a passthru value matching the result type.
199 static bool isMergePassthruOpcode(unsigned Opc) {
200   switch (Opc) {
201   default:
202     return false;
203   case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
204   case AArch64ISD::BSWAP_MERGE_PASSTHRU:
205   case AArch64ISD::CTLZ_MERGE_PASSTHRU:
206   case AArch64ISD::CTPOP_MERGE_PASSTHRU:
207   case AArch64ISD::DUP_MERGE_PASSTHRU:
208   case AArch64ISD::ABS_MERGE_PASSTHRU:
209   case AArch64ISD::NEG_MERGE_PASSTHRU:
210   case AArch64ISD::FNEG_MERGE_PASSTHRU:
211   case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
212   case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
213   case AArch64ISD::FCEIL_MERGE_PASSTHRU:
214   case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
215   case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
216   case AArch64ISD::FRINT_MERGE_PASSTHRU:
217   case AArch64ISD::FROUND_MERGE_PASSTHRU:
218   case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
219   case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
220   case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
221   case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
222   case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
223   case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
224   case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
225   case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
226   case AArch64ISD::FSQRT_MERGE_PASSTHRU:
227   case AArch64ISD::FRECPX_MERGE_PASSTHRU:
228   case AArch64ISD::FABS_MERGE_PASSTHRU:
229     return true;
230   }
231 }
232 
233 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
234                                              const AArch64Subtarget &STI)
235     : TargetLowering(TM), Subtarget(&STI) {
236   // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
237   // we have to make something up. Arbitrarily, choose ZeroOrOne.
238   setBooleanContents(ZeroOrOneBooleanContent);
239   // When comparing vectors the result sets the different elements in the
240   // vector to all-one or all-zero.
241   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
242 
243   // Set up the register classes.
244   addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
245   addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
246 
247   if (Subtarget->hasFPARMv8()) {
248     addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
249     addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
250     addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
251     addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
252     addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
253   }
254 
255   if (Subtarget->hasNEON()) {
256     addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
257     addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
258     // Someone set us up the NEON.
259     addDRTypeForNEON(MVT::v2f32);
260     addDRTypeForNEON(MVT::v8i8);
261     addDRTypeForNEON(MVT::v4i16);
262     addDRTypeForNEON(MVT::v2i32);
263     addDRTypeForNEON(MVT::v1i64);
264     addDRTypeForNEON(MVT::v1f64);
265     addDRTypeForNEON(MVT::v4f16);
266     if (Subtarget->hasBF16())
267       addDRTypeForNEON(MVT::v4bf16);
268 
269     addQRTypeForNEON(MVT::v4f32);
270     addQRTypeForNEON(MVT::v2f64);
271     addQRTypeForNEON(MVT::v16i8);
272     addQRTypeForNEON(MVT::v8i16);
273     addQRTypeForNEON(MVT::v4i32);
274     addQRTypeForNEON(MVT::v2i64);
275     addQRTypeForNEON(MVT::v8f16);
276     if (Subtarget->hasBF16())
277       addQRTypeForNEON(MVT::v8bf16);
278   }
279 
280   if (Subtarget->hasSVE()) {
281     // Add legal sve predicate types
282     addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
283     addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
284     addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
285     addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
286 
287     // Add legal sve data types
288     addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
289     addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
290     addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
291     addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
292 
293     addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
294     addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
295     addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
296     addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
297     addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
298     addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
299 
300     if (Subtarget->hasBF16()) {
301       addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
302       addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
303       addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
304     }
305 
306     if (Subtarget->useSVEForFixedLengthVectors()) {
307       for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
308         if (useSVEForFixedLengthVectorVT(VT))
309           addRegisterClass(VT, &AArch64::ZPRRegClass);
310 
311       for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
312         if (useSVEForFixedLengthVectorVT(VT))
313           addRegisterClass(VT, &AArch64::ZPRRegClass);
314     }
315 
316     for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
317       setOperationAction(ISD::SADDSAT, VT, Legal);
318       setOperationAction(ISD::UADDSAT, VT, Legal);
319       setOperationAction(ISD::SSUBSAT, VT, Legal);
320       setOperationAction(ISD::USUBSAT, VT, Legal);
321       setOperationAction(ISD::UREM, VT, Expand);
322       setOperationAction(ISD::SREM, VT, Expand);
323       setOperationAction(ISD::SDIVREM, VT, Expand);
324       setOperationAction(ISD::UDIVREM, VT, Expand);
325     }
326 
327     for (auto VT :
328          { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
329            MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
330       setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
331 
332     for (auto VT :
333          { MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32,
334            MVT::nxv2f64 }) {
335       setCondCodeAction(ISD::SETO, VT, Expand);
336       setCondCodeAction(ISD::SETOLT, VT, Expand);
337       setCondCodeAction(ISD::SETLT, VT, Expand);
338       setCondCodeAction(ISD::SETOLE, VT, Expand);
339       setCondCodeAction(ISD::SETLE, VT, Expand);
340       setCondCodeAction(ISD::SETULT, VT, Expand);
341       setCondCodeAction(ISD::SETULE, VT, Expand);
342       setCondCodeAction(ISD::SETUGE, VT, Expand);
343       setCondCodeAction(ISD::SETUGT, VT, Expand);
344       setCondCodeAction(ISD::SETUEQ, VT, Expand);
345       setCondCodeAction(ISD::SETUNE, VT, Expand);
346     }
347   }
348 
349   // Compute derived properties from the register classes
350   computeRegisterProperties(Subtarget->getRegisterInfo());
351 
352   // Provide all sorts of operation actions
353   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
354   setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
355   setOperationAction(ISD::SETCC, MVT::i32, Custom);
356   setOperationAction(ISD::SETCC, MVT::i64, Custom);
357   setOperationAction(ISD::SETCC, MVT::f16, Custom);
358   setOperationAction(ISD::SETCC, MVT::f32, Custom);
359   setOperationAction(ISD::SETCC, MVT::f64, Custom);
360   setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
361   setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
362   setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
363   setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
364   setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
365   setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
366   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
367   setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
368   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
369   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
370   setOperationAction(ISD::BR_CC, MVT::i64, Custom);
371   setOperationAction(ISD::BR_CC, MVT::f16, Custom);
372   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
373   setOperationAction(ISD::BR_CC, MVT::f64, Custom);
374   setOperationAction(ISD::SELECT, MVT::i32, Custom);
375   setOperationAction(ISD::SELECT, MVT::i64, Custom);
376   setOperationAction(ISD::SELECT, MVT::f16, Custom);
377   setOperationAction(ISD::SELECT, MVT::f32, Custom);
378   setOperationAction(ISD::SELECT, MVT::f64, Custom);
379   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
380   setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
381   setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
382   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
383   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
384   setOperationAction(ISD::BR_JT, MVT::Other, Custom);
385   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
386 
387   setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
388   setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
389   setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
390 
391   setOperationAction(ISD::FREM, MVT::f32, Expand);
392   setOperationAction(ISD::FREM, MVT::f64, Expand);
393   setOperationAction(ISD::FREM, MVT::f80, Expand);
394 
395   setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
396 
397   // Custom lowering hooks are needed for XOR
398   // to fold it into CSINC/CSINV.
399   setOperationAction(ISD::XOR, MVT::i32, Custom);
400   setOperationAction(ISD::XOR, MVT::i64, Custom);
401 
402   // Virtually no operation on f128 is legal, but LLVM can't expand them when
403   // there's a valid register class, so we need custom operations in most cases.
404   setOperationAction(ISD::FABS, MVT::f128, Expand);
405   setOperationAction(ISD::FADD, MVT::f128, LibCall);
406   setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
407   setOperationAction(ISD::FCOS, MVT::f128, Expand);
408   setOperationAction(ISD::FDIV, MVT::f128, LibCall);
409   setOperationAction(ISD::FMA, MVT::f128, Expand);
410   setOperationAction(ISD::FMUL, MVT::f128, LibCall);
411   setOperationAction(ISD::FNEG, MVT::f128, Expand);
412   setOperationAction(ISD::FPOW, MVT::f128, Expand);
413   setOperationAction(ISD::FREM, MVT::f128, Expand);
414   setOperationAction(ISD::FRINT, MVT::f128, Expand);
415   setOperationAction(ISD::FSIN, MVT::f128, Expand);
416   setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
417   setOperationAction(ISD::FSQRT, MVT::f128, Expand);
418   setOperationAction(ISD::FSUB, MVT::f128, LibCall);
419   setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
420   setOperationAction(ISD::SETCC, MVT::f128, Custom);
421   setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
422   setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
423   setOperationAction(ISD::BR_CC, MVT::f128, Custom);
424   setOperationAction(ISD::SELECT, MVT::f128, Custom);
425   setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
426   setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
427 
428   // Lowering for many of the conversions is actually specified by the non-f128
429   // type. The LowerXXX function will be trivial when f128 isn't involved.
430   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
431   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
432   setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
433   setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
434   setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
435   setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
436   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
437   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
438   setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
439   setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
440   setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
441   setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
442   setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
443   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
444   setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
445   setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
446   setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
447   setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
448   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
449   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
450   setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
451   setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
452   setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
453   setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
454   setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
455   setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
456   setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
457   setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
458   setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
459   setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
460 
461   // Variable arguments.
462   setOperationAction(ISD::VASTART, MVT::Other, Custom);
463   setOperationAction(ISD::VAARG, MVT::Other, Custom);
464   setOperationAction(ISD::VACOPY, MVT::Other, Custom);
465   setOperationAction(ISD::VAEND, MVT::Other, Expand);
466 
467   // Variable-sized objects.
468   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
469   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
470 
471   if (Subtarget->isTargetWindows())
472     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
473   else
474     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
475 
476   // Constant pool entries
477   setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
478 
479   // BlockAddress
480   setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
481 
482   // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
483   setOperationAction(ISD::ADDC, MVT::i32, Custom);
484   setOperationAction(ISD::ADDE, MVT::i32, Custom);
485   setOperationAction(ISD::SUBC, MVT::i32, Custom);
486   setOperationAction(ISD::SUBE, MVT::i32, Custom);
487   setOperationAction(ISD::ADDC, MVT::i64, Custom);
488   setOperationAction(ISD::ADDE, MVT::i64, Custom);
489   setOperationAction(ISD::SUBC, MVT::i64, Custom);
490   setOperationAction(ISD::SUBE, MVT::i64, Custom);
491 
492   // AArch64 lacks both left-rotate and popcount instructions.
493   setOperationAction(ISD::ROTL, MVT::i32, Expand);
494   setOperationAction(ISD::ROTL, MVT::i64, Expand);
495   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
496     setOperationAction(ISD::ROTL, VT, Expand);
497     setOperationAction(ISD::ROTR, VT, Expand);
498   }
499 
500   // AArch64 doesn't have i32 MULH{S|U}.
501   setOperationAction(ISD::MULHU, MVT::i32, Expand);
502   setOperationAction(ISD::MULHS, MVT::i32, Expand);
503 
504   // AArch64 doesn't have {U|S}MUL_LOHI.
505   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
506   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
507 
508   setOperationAction(ISD::CTPOP, MVT::i32, Custom);
509   setOperationAction(ISD::CTPOP, MVT::i64, Custom);
510   setOperationAction(ISD::CTPOP, MVT::i128, Custom);
511 
512   setOperationAction(ISD::ABS, MVT::i32, Custom);
513   setOperationAction(ISD::ABS, MVT::i64, Custom);
514 
515   setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
516   setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
517   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
518     setOperationAction(ISD::SDIVREM, VT, Expand);
519     setOperationAction(ISD::UDIVREM, VT, Expand);
520   }
521   setOperationAction(ISD::SREM, MVT::i32, Expand);
522   setOperationAction(ISD::SREM, MVT::i64, Expand);
523   setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
524   setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
525   setOperationAction(ISD::UREM, MVT::i32, Expand);
526   setOperationAction(ISD::UREM, MVT::i64, Expand);
527 
528   // Custom lower Add/Sub/Mul with overflow.
529   setOperationAction(ISD::SADDO, MVT::i32, Custom);
530   setOperationAction(ISD::SADDO, MVT::i64, Custom);
531   setOperationAction(ISD::UADDO, MVT::i32, Custom);
532   setOperationAction(ISD::UADDO, MVT::i64, Custom);
533   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
534   setOperationAction(ISD::SSUBO, MVT::i64, Custom);
535   setOperationAction(ISD::USUBO, MVT::i32, Custom);
536   setOperationAction(ISD::USUBO, MVT::i64, Custom);
537   setOperationAction(ISD::SMULO, MVT::i32, Custom);
538   setOperationAction(ISD::SMULO, MVT::i64, Custom);
539   setOperationAction(ISD::UMULO, MVT::i32, Custom);
540   setOperationAction(ISD::UMULO, MVT::i64, Custom);
541 
542   setOperationAction(ISD::FSIN, MVT::f32, Expand);
543   setOperationAction(ISD::FSIN, MVT::f64, Expand);
544   setOperationAction(ISD::FCOS, MVT::f32, Expand);
545   setOperationAction(ISD::FCOS, MVT::f64, Expand);
546   setOperationAction(ISD::FPOW, MVT::f32, Expand);
547   setOperationAction(ISD::FPOW, MVT::f64, Expand);
548   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
549   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
550   if (Subtarget->hasFullFP16())
551     setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
552   else
553     setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
554 
555   setOperationAction(ISD::FREM,    MVT::f16,   Promote);
556   setOperationAction(ISD::FREM,    MVT::v4f16, Expand);
557   setOperationAction(ISD::FREM,    MVT::v8f16, Expand);
558   setOperationAction(ISD::FPOW,    MVT::f16,   Promote);
559   setOperationAction(ISD::FPOW,    MVT::v4f16, Expand);
560   setOperationAction(ISD::FPOW,    MVT::v8f16, Expand);
561   setOperationAction(ISD::FPOWI,   MVT::f16,   Promote);
562   setOperationAction(ISD::FPOWI,   MVT::v4f16, Expand);
563   setOperationAction(ISD::FPOWI,   MVT::v8f16, Expand);
564   setOperationAction(ISD::FCOS,    MVT::f16,   Promote);
565   setOperationAction(ISD::FCOS,    MVT::v4f16, Expand);
566   setOperationAction(ISD::FCOS,    MVT::v8f16, Expand);
567   setOperationAction(ISD::FSIN,    MVT::f16,   Promote);
568   setOperationAction(ISD::FSIN,    MVT::v4f16, Expand);
569   setOperationAction(ISD::FSIN,    MVT::v8f16, Expand);
570   setOperationAction(ISD::FSINCOS, MVT::f16,   Promote);
571   setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
572   setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
573   setOperationAction(ISD::FEXP,    MVT::f16,   Promote);
574   setOperationAction(ISD::FEXP,    MVT::v4f16, Expand);
575   setOperationAction(ISD::FEXP,    MVT::v8f16, Expand);
576   setOperationAction(ISD::FEXP2,   MVT::f16,   Promote);
577   setOperationAction(ISD::FEXP2,   MVT::v4f16, Expand);
578   setOperationAction(ISD::FEXP2,   MVT::v8f16, Expand);
579   setOperationAction(ISD::FLOG,    MVT::f16,   Promote);
580   setOperationAction(ISD::FLOG,    MVT::v4f16, Expand);
581   setOperationAction(ISD::FLOG,    MVT::v8f16, Expand);
582   setOperationAction(ISD::FLOG2,   MVT::f16,   Promote);
583   setOperationAction(ISD::FLOG2,   MVT::v4f16, Expand);
584   setOperationAction(ISD::FLOG2,   MVT::v8f16, Expand);
585   setOperationAction(ISD::FLOG10,  MVT::f16,   Promote);
586   setOperationAction(ISD::FLOG10,  MVT::v4f16, Expand);
587   setOperationAction(ISD::FLOG10,  MVT::v8f16, Expand);
588 
589   if (!Subtarget->hasFullFP16()) {
590     setOperationAction(ISD::SELECT,      MVT::f16,  Promote);
591     setOperationAction(ISD::SELECT_CC,   MVT::f16,  Promote);
592     setOperationAction(ISD::SETCC,       MVT::f16,  Promote);
593     setOperationAction(ISD::BR_CC,       MVT::f16,  Promote);
594     setOperationAction(ISD::FADD,        MVT::f16,  Promote);
595     setOperationAction(ISD::FSUB,        MVT::f16,  Promote);
596     setOperationAction(ISD::FMUL,        MVT::f16,  Promote);
597     setOperationAction(ISD::FDIV,        MVT::f16,  Promote);
598     setOperationAction(ISD::FMA,         MVT::f16,  Promote);
599     setOperationAction(ISD::FNEG,        MVT::f16,  Promote);
600     setOperationAction(ISD::FABS,        MVT::f16,  Promote);
601     setOperationAction(ISD::FCEIL,       MVT::f16,  Promote);
602     setOperationAction(ISD::FSQRT,       MVT::f16,  Promote);
603     setOperationAction(ISD::FFLOOR,      MVT::f16,  Promote);
604     setOperationAction(ISD::FNEARBYINT,  MVT::f16,  Promote);
605     setOperationAction(ISD::FRINT,       MVT::f16,  Promote);
606     setOperationAction(ISD::FROUND,      MVT::f16,  Promote);
607     setOperationAction(ISD::FTRUNC,      MVT::f16,  Promote);
608     setOperationAction(ISD::FMINNUM,     MVT::f16,  Promote);
609     setOperationAction(ISD::FMAXNUM,     MVT::f16,  Promote);
610     setOperationAction(ISD::FMINIMUM,    MVT::f16,  Promote);
611     setOperationAction(ISD::FMAXIMUM,    MVT::f16,  Promote);
612 
613     // promote v4f16 to v4f32 when that is known to be safe.
614     setOperationAction(ISD::FADD,        MVT::v4f16, Promote);
615     setOperationAction(ISD::FSUB,        MVT::v4f16, Promote);
616     setOperationAction(ISD::FMUL,        MVT::v4f16, Promote);
617     setOperationAction(ISD::FDIV,        MVT::v4f16, Promote);
618     AddPromotedToType(ISD::FADD,         MVT::v4f16, MVT::v4f32);
619     AddPromotedToType(ISD::FSUB,         MVT::v4f16, MVT::v4f32);
620     AddPromotedToType(ISD::FMUL,         MVT::v4f16, MVT::v4f32);
621     AddPromotedToType(ISD::FDIV,         MVT::v4f16, MVT::v4f32);
622 
623     setOperationAction(ISD::FABS,        MVT::v4f16, Expand);
624     setOperationAction(ISD::FNEG,        MVT::v4f16, Expand);
625     setOperationAction(ISD::FROUND,      MVT::v4f16, Expand);
626     setOperationAction(ISD::FMA,         MVT::v4f16, Expand);
627     setOperationAction(ISD::SETCC,       MVT::v4f16, Expand);
628     setOperationAction(ISD::BR_CC,       MVT::v4f16, Expand);
629     setOperationAction(ISD::SELECT,      MVT::v4f16, Expand);
630     setOperationAction(ISD::SELECT_CC,   MVT::v4f16, Expand);
631     setOperationAction(ISD::FTRUNC,      MVT::v4f16, Expand);
632     setOperationAction(ISD::FCOPYSIGN,   MVT::v4f16, Expand);
633     setOperationAction(ISD::FFLOOR,      MVT::v4f16, Expand);
634     setOperationAction(ISD::FCEIL,       MVT::v4f16, Expand);
635     setOperationAction(ISD::FRINT,       MVT::v4f16, Expand);
636     setOperationAction(ISD::FNEARBYINT,  MVT::v4f16, Expand);
637     setOperationAction(ISD::FSQRT,       MVT::v4f16, Expand);
638 
639     setOperationAction(ISD::FABS,        MVT::v8f16, Expand);
640     setOperationAction(ISD::FADD,        MVT::v8f16, Expand);
641     setOperationAction(ISD::FCEIL,       MVT::v8f16, Expand);
642     setOperationAction(ISD::FCOPYSIGN,   MVT::v8f16, Expand);
643     setOperationAction(ISD::FDIV,        MVT::v8f16, Expand);
644     setOperationAction(ISD::FFLOOR,      MVT::v8f16, Expand);
645     setOperationAction(ISD::FMA,         MVT::v8f16, Expand);
646     setOperationAction(ISD::FMUL,        MVT::v8f16, Expand);
647     setOperationAction(ISD::FNEARBYINT,  MVT::v8f16, Expand);
648     setOperationAction(ISD::FNEG,        MVT::v8f16, Expand);
649     setOperationAction(ISD::FROUND,      MVT::v8f16, Expand);
650     setOperationAction(ISD::FRINT,       MVT::v8f16, Expand);
651     setOperationAction(ISD::FSQRT,       MVT::v8f16, Expand);
652     setOperationAction(ISD::FSUB,        MVT::v8f16, Expand);
653     setOperationAction(ISD::FTRUNC,      MVT::v8f16, Expand);
654     setOperationAction(ISD::SETCC,       MVT::v8f16, Expand);
655     setOperationAction(ISD::BR_CC,       MVT::v8f16, Expand);
656     setOperationAction(ISD::SELECT,      MVT::v8f16, Expand);
657     setOperationAction(ISD::SELECT_CC,   MVT::v8f16, Expand);
658     setOperationAction(ISD::FP_EXTEND,   MVT::v8f16, Expand);
659   }
660 
661   // AArch64 has implementations of a lot of rounding-like FP operations.
662   for (MVT Ty : {MVT::f32, MVT::f64}) {
663     setOperationAction(ISD::FFLOOR, Ty, Legal);
664     setOperationAction(ISD::FNEARBYINT, Ty, Legal);
665     setOperationAction(ISD::FCEIL, Ty, Legal);
666     setOperationAction(ISD::FRINT, Ty, Legal);
667     setOperationAction(ISD::FTRUNC, Ty, Legal);
668     setOperationAction(ISD::FROUND, Ty, Legal);
669     setOperationAction(ISD::FMINNUM, Ty, Legal);
670     setOperationAction(ISD::FMAXNUM, Ty, Legal);
671     setOperationAction(ISD::FMINIMUM, Ty, Legal);
672     setOperationAction(ISD::FMAXIMUM, Ty, Legal);
673     setOperationAction(ISD::LROUND, Ty, Legal);
674     setOperationAction(ISD::LLROUND, Ty, Legal);
675     setOperationAction(ISD::LRINT, Ty, Legal);
676     setOperationAction(ISD::LLRINT, Ty, Legal);
677   }
678 
679   if (Subtarget->hasFullFP16()) {
680     setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
681     setOperationAction(ISD::FFLOOR,  MVT::f16, Legal);
682     setOperationAction(ISD::FCEIL,   MVT::f16, Legal);
683     setOperationAction(ISD::FRINT,   MVT::f16, Legal);
684     setOperationAction(ISD::FTRUNC,  MVT::f16, Legal);
685     setOperationAction(ISD::FROUND,  MVT::f16, Legal);
686     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
687     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
688     setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
689     setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
690   }
691 
692   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
693 
694   setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
695 
696   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
697   setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
698   setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
699   setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
700   setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
701 
702   // Generate outline atomics library calls only if LSE was not specified for
703   // subtarget
704   if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
705     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
706     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
707     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
708     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
709     setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
710     setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
711     setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
712     setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
713     setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
714     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
715     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
716     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
717     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
718     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
719     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
720     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
721     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
722     setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
723     setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
724     setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
725     setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
726     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
727     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
728     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
729     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
730 #define LCALLNAMES(A, B, N)                                                    \
731   setLibcallName(A##N##_RELAX, #B #N "_relax");                                \
732   setLibcallName(A##N##_ACQ, #B #N "_acq");                                    \
733   setLibcallName(A##N##_REL, #B #N "_rel");                                    \
734   setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
735 #define LCALLNAME4(A, B)                                                       \
736   LCALLNAMES(A, B, 1)                                                          \
737   LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
738 #define LCALLNAME5(A, B)                                                       \
739   LCALLNAMES(A, B, 1)                                                          \
740   LCALLNAMES(A, B, 2)                                                          \
741   LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
742     LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
743     LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
744     LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
745     LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
746     LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
747     LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
748 #undef LCALLNAMES
749 #undef LCALLNAME4
750 #undef LCALLNAME5
751   }
752 
753   // 128-bit loads and stores can be done without expanding
754   setOperationAction(ISD::LOAD, MVT::i128, Custom);
755   setOperationAction(ISD::STORE, MVT::i128, Custom);
756 
757   // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
758   // custom lowering, as there are no un-paired non-temporal stores and
759   // legalization will break up 256 bit inputs.
760   setOperationAction(ISD::STORE, MVT::v32i8, Custom);
761   setOperationAction(ISD::STORE, MVT::v16i16, Custom);
762   setOperationAction(ISD::STORE, MVT::v16f16, Custom);
763   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
764   setOperationAction(ISD::STORE, MVT::v8f32, Custom);
765   setOperationAction(ISD::STORE, MVT::v4f64, Custom);
766   setOperationAction(ISD::STORE, MVT::v4i64, Custom);
767 
768   // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
769   // This requires the Performance Monitors extension.
770   if (Subtarget->hasPerfMon())
771     setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
772 
773   if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
774       getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
775     // Issue __sincos_stret if available.
776     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
777     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
778   } else {
779     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
780     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
781   }
782 
783   if (Subtarget->getTargetTriple().isOSMSVCRT()) {
784     // MSVCRT doesn't have powi; fall back to pow
785     setLibcallName(RTLIB::POWI_F32, nullptr);
786     setLibcallName(RTLIB::POWI_F64, nullptr);
787   }
788 
789   // Make floating-point constants legal for the large code model, so they don't
790   // become loads from the constant pool.
791   if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
792     setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
793     setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
794   }
795 
796   // AArch64 does not have floating-point extending loads, i1 sign-extending
797   // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
798   for (MVT VT : MVT::fp_valuetypes()) {
799     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
800     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
801     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
802     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
803   }
804   for (MVT VT : MVT::integer_valuetypes())
805     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
806 
807   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
808   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
809   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
810   setTruncStoreAction(MVT::f128, MVT::f80, Expand);
811   setTruncStoreAction(MVT::f128, MVT::f64, Expand);
812   setTruncStoreAction(MVT::f128, MVT::f32, Expand);
813   setTruncStoreAction(MVT::f128, MVT::f16, Expand);
814 
815   setOperationAction(ISD::BITCAST, MVT::i16, Custom);
816   setOperationAction(ISD::BITCAST, MVT::f16, Custom);
817   setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
818 
819   // Indexed loads and stores are supported.
820   for (unsigned im = (unsigned)ISD::PRE_INC;
821        im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
822     setIndexedLoadAction(im, MVT::i8, Legal);
823     setIndexedLoadAction(im, MVT::i16, Legal);
824     setIndexedLoadAction(im, MVT::i32, Legal);
825     setIndexedLoadAction(im, MVT::i64, Legal);
826     setIndexedLoadAction(im, MVT::f64, Legal);
827     setIndexedLoadAction(im, MVT::f32, Legal);
828     setIndexedLoadAction(im, MVT::f16, Legal);
829     setIndexedLoadAction(im, MVT::bf16, Legal);
830     setIndexedStoreAction(im, MVT::i8, Legal);
831     setIndexedStoreAction(im, MVT::i16, Legal);
832     setIndexedStoreAction(im, MVT::i32, Legal);
833     setIndexedStoreAction(im, MVT::i64, Legal);
834     setIndexedStoreAction(im, MVT::f64, Legal);
835     setIndexedStoreAction(im, MVT::f32, Legal);
836     setIndexedStoreAction(im, MVT::f16, Legal);
837     setIndexedStoreAction(im, MVT::bf16, Legal);
838   }
839 
840   // Trap.
841   setOperationAction(ISD::TRAP, MVT::Other, Legal);
842   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
843   setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
844 
845   // We combine OR nodes for bitfield operations.
846   setTargetDAGCombine(ISD::OR);
847   // Try to create BICs for vector ANDs.
848   setTargetDAGCombine(ISD::AND);
849 
850   // Vector add and sub nodes may conceal a high-half opportunity.
851   // Also, try to fold ADD into CSINC/CSINV.
852   setTargetDAGCombine(ISD::ADD);
853   setTargetDAGCombine(ISD::ABS);
854   setTargetDAGCombine(ISD::SUB);
855   setTargetDAGCombine(ISD::SRL);
856   setTargetDAGCombine(ISD::XOR);
857   setTargetDAGCombine(ISD::SINT_TO_FP);
858   setTargetDAGCombine(ISD::UINT_TO_FP);
859 
860   setTargetDAGCombine(ISD::FP_TO_SINT);
861   setTargetDAGCombine(ISD::FP_TO_UINT);
862   setTargetDAGCombine(ISD::FDIV);
863 
864   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
865 
866   setTargetDAGCombine(ISD::ANY_EXTEND);
867   setTargetDAGCombine(ISD::ZERO_EXTEND);
868   setTargetDAGCombine(ISD::SIGN_EXTEND);
869   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
870   setTargetDAGCombine(ISD::TRUNCATE);
871   setTargetDAGCombine(ISD::CONCAT_VECTORS);
872   setTargetDAGCombine(ISD::STORE);
873   if (Subtarget->supportsAddressTopByteIgnored())
874     setTargetDAGCombine(ISD::LOAD);
875 
876   setTargetDAGCombine(ISD::MGATHER);
877   setTargetDAGCombine(ISD::MSCATTER);
878 
879   setTargetDAGCombine(ISD::MUL);
880 
881   setTargetDAGCombine(ISD::SELECT);
882   setTargetDAGCombine(ISD::VSELECT);
883 
884   setTargetDAGCombine(ISD::INTRINSIC_VOID);
885   setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
886   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
887   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
888   setTargetDAGCombine(ISD::VECREDUCE_ADD);
889 
890   setTargetDAGCombine(ISD::GlobalAddress);
891 
892   // In case of strict alignment, avoid an excessive number of byte wide stores.
893   MaxStoresPerMemsetOptSize = 8;
894   MaxStoresPerMemset = Subtarget->requiresStrictAlign()
895                        ? MaxStoresPerMemsetOptSize : 32;
896 
897   MaxGluedStoresPerMemcpy = 4;
898   MaxStoresPerMemcpyOptSize = 4;
899   MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
900                        ? MaxStoresPerMemcpyOptSize : 16;
901 
902   MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
903 
904   MaxLoadsPerMemcmpOptSize = 4;
905   MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign()
906                       ? MaxLoadsPerMemcmpOptSize : 8;
907 
908   setStackPointerRegisterToSaveRestore(AArch64::SP);
909 
910   setSchedulingPreference(Sched::Hybrid);
911 
912   EnableExtLdPromotion = true;
913 
914   // Set required alignment.
915   setMinFunctionAlignment(Align(4));
916   // Set preferred alignments.
917   setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
918   setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));
919 
920   // Only change the limit for entries in a jump table if specified by
921   // the sub target, but not at the command line.
922   unsigned MaxJT = STI.getMaximumJumpTableSize();
923   if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
924     setMaximumJumpTableSize(MaxJT);
925 
926   setHasExtractBitsInsn(true);
927 
928   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
929 
930   if (Subtarget->hasNEON()) {
931     // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
932     // silliness like this:
933     setOperationAction(ISD::FABS, MVT::v1f64, Expand);
934     setOperationAction(ISD::FADD, MVT::v1f64, Expand);
935     setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
936     setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
937     setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
938     setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
939     setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
940     setOperationAction(ISD::FMA, MVT::v1f64, Expand);
941     setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
942     setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
943     setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
944     setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
945     setOperationAction(ISD::FREM, MVT::v1f64, Expand);
946     setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
947     setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
948     setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
949     setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
950     setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
951     setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
952     setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
953     setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
954     setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
955     setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
956     setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
957     setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
958 
959     setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
960     setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
961     setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
962     setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
963     setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
964 
965     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
966 
967     // AArch64 doesn't have direct vector -> f32 conversion instructions for
968     // elements smaller than i32, so promote the input to i32 first.
969     setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
970     setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
971     // i8 vector elements also need promotion to i32 for v8i8
972     setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
973     setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
974     // Similarly, there is no direct i32 -> f64 vector conversion instruction.
975     setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
976     setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
977     setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
978     setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
979     // Or, direct i32 -> f16 vector conversion.  Set it to custom, so the
980     // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
981     setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
982     setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
983 
984     if (Subtarget->hasFullFP16()) {
985       setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
986       setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
987       setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
988       setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
989     } else {
990       // when AArch64 doesn't have fullfp16 support, promote the input
991       // to i32 first.
992       setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
993       setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
994       setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
995       setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
996     }
997 
998     setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
999     setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
1000 
1001     // AArch64 doesn't have MUL.2d:
1002     setOperationAction(ISD::MUL, MVT::v2i64, Expand);
1003     // Custom handling for some quad-vector types to detect MULL.
1004     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1005     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1006     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1007 
1008     // Saturates
1009     for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1010                     MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1011       setOperationAction(ISD::SADDSAT, VT, Legal);
1012       setOperationAction(ISD::UADDSAT, VT, Legal);
1013       setOperationAction(ISD::SSUBSAT, VT, Legal);
1014       setOperationAction(ISD::USUBSAT, VT, Legal);
1015     }
1016 
1017     // Vector reductions
1018     for (MVT VT : { MVT::v4f16, MVT::v2f32,
1019                     MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1020       if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1021         setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1022         setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1023 
1024         setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
1025       }
1026     }
1027     for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1028                     MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1029       setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1030       setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1031       setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1032       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1033       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1034     }
1035     setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
1036 
1037     setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
1038     setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1039     // Likewise, narrowing and extending vector loads/stores aren't handled
1040     // directly.
1041     for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1042       setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
1043 
1044       if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1045         setOperationAction(ISD::MULHS, VT, Legal);
1046         setOperationAction(ISD::MULHU, VT, Legal);
1047       } else {
1048         setOperationAction(ISD::MULHS, VT, Expand);
1049         setOperationAction(ISD::MULHU, VT, Expand);
1050       }
1051       setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1052       setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1053 
1054       setOperationAction(ISD::BSWAP, VT, Expand);
1055       setOperationAction(ISD::CTTZ, VT, Expand);
1056 
1057       for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1058         setTruncStoreAction(VT, InnerVT, Expand);
1059         setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1060         setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1061         setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1062       }
1063     }
1064 
1065     // AArch64 has implementations of a lot of rounding-like FP operations.
1066     for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
1067       setOperationAction(ISD::FFLOOR, Ty, Legal);
1068       setOperationAction(ISD::FNEARBYINT, Ty, Legal);
1069       setOperationAction(ISD::FCEIL, Ty, Legal);
1070       setOperationAction(ISD::FRINT, Ty, Legal);
1071       setOperationAction(ISD::FTRUNC, Ty, Legal);
1072       setOperationAction(ISD::FROUND, Ty, Legal);
1073     }
1074 
1075     if (Subtarget->hasFullFP16()) {
1076       for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
1077         setOperationAction(ISD::FFLOOR, Ty, Legal);
1078         setOperationAction(ISD::FNEARBYINT, Ty, Legal);
1079         setOperationAction(ISD::FCEIL, Ty, Legal);
1080         setOperationAction(ISD::FRINT, Ty, Legal);
1081         setOperationAction(ISD::FTRUNC, Ty, Legal);
1082         setOperationAction(ISD::FROUND, Ty, Legal);
1083       }
1084     }
1085 
1086     if (Subtarget->hasSVE())
1087       setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1088 
1089     setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1090   }
1091 
1092   if (Subtarget->hasSVE()) {
1093     // FIXME: Add custom lowering of MLOAD to handle different passthrus (not a
1094     // splat of 0 or undef) once vector selects supported in SVE codegen. See
1095     // D68877 for more details.
1096     for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1097       setOperationAction(ISD::BITREVERSE, VT, Custom);
1098       setOperationAction(ISD::BSWAP, VT, Custom);
1099       setOperationAction(ISD::CTLZ, VT, Custom);
1100       setOperationAction(ISD::CTPOP, VT, Custom);
1101       setOperationAction(ISD::CTTZ, VT, Custom);
1102       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1103       setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1104       setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1105       setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1106       setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1107       setOperationAction(ISD::MGATHER, VT, Custom);
1108       setOperationAction(ISD::MSCATTER, VT, Custom);
1109       setOperationAction(ISD::MUL, VT, Custom);
1110       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1111       setOperationAction(ISD::SELECT, VT, Custom);
1112       setOperationAction(ISD::SDIV, VT, Custom);
1113       setOperationAction(ISD::UDIV, VT, Custom);
1114       setOperationAction(ISD::SMIN, VT, Custom);
1115       setOperationAction(ISD::UMIN, VT, Custom);
1116       setOperationAction(ISD::SMAX, VT, Custom);
1117       setOperationAction(ISD::UMAX, VT, Custom);
1118       setOperationAction(ISD::SHL, VT, Custom);
1119       setOperationAction(ISD::SRL, VT, Custom);
1120       setOperationAction(ISD::SRA, VT, Custom);
1121       setOperationAction(ISD::ABS, VT, Custom);
1122       setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1123       setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1124       setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1125       setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1126       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1127       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1128       setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1129       setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1130     }
1131 
1132     // Illegal unpacked integer vector types.
1133     for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1134       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1135       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1136     }
1137 
1138     for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1139       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1140       setOperationAction(ISD::SELECT, VT, Custom);
1141       setOperationAction(ISD::SETCC, VT, Custom);
1142       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1143       setOperationAction(ISD::TRUNCATE, VT, Custom);
1144       setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1145       setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1146       setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1147 
1148       // There are no legal MVT::nxv16f## based types.
1149       if (VT != MVT::nxv16i1) {
1150         setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1151         setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1152       }
1153     }
1154 
1155     for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1156                     MVT::nxv4f32, MVT::nxv2f64}) {
1157       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1158       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1159       setOperationAction(ISD::MGATHER, VT, Custom);
1160       setOperationAction(ISD::MSCATTER, VT, Custom);
1161       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1162       setOperationAction(ISD::SELECT, VT, Custom);
1163       setOperationAction(ISD::FADD, VT, Custom);
1164       setOperationAction(ISD::FDIV, VT, Custom);
1165       setOperationAction(ISD::FMA, VT, Custom);
1166       setOperationAction(ISD::FMAXNUM, VT, Custom);
1167       setOperationAction(ISD::FMINNUM, VT, Custom);
1168       setOperationAction(ISD::FMUL, VT, Custom);
1169       setOperationAction(ISD::FNEG, VT, Custom);
1170       setOperationAction(ISD::FSUB, VT, Custom);
1171       setOperationAction(ISD::FCEIL, VT, Custom);
1172       setOperationAction(ISD::FFLOOR, VT, Custom);
1173       setOperationAction(ISD::FNEARBYINT, VT, Custom);
1174       setOperationAction(ISD::FRINT, VT, Custom);
1175       setOperationAction(ISD::FROUND, VT, Custom);
1176       setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1177       setOperationAction(ISD::FTRUNC, VT, Custom);
1178       setOperationAction(ISD::FSQRT, VT, Custom);
1179       setOperationAction(ISD::FABS, VT, Custom);
1180       setOperationAction(ISD::FP_EXTEND, VT, Custom);
1181       setOperationAction(ISD::FP_ROUND, VT, Custom);
1182       setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1183       setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1184       setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1185       setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1186     }
1187 
1188     for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1189       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1190       setOperationAction(ISD::MGATHER, VT, Custom);
1191       setOperationAction(ISD::MSCATTER, VT, Custom);
1192     }
1193 
1194     setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom);
1195 
1196     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
1197     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
1198 
1199     // NOTE: Currently this has to happen after computeRegisterProperties rather
1200     // than the preferred option of combining it with the addRegisterClass call.
1201     if (Subtarget->useSVEForFixedLengthVectors()) {
1202       for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
1203         if (useSVEForFixedLengthVectorVT(VT))
1204           addTypeForFixedLengthSVE(VT);
1205       for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
1206         if (useSVEForFixedLengthVectorVT(VT))
1207           addTypeForFixedLengthSVE(VT);
1208 
1209       // 64bit results can mean a bigger than NEON input.
1210       for (auto VT : {MVT::v8i8, MVT::v4i16})
1211         setOperationAction(ISD::TRUNCATE, VT, Custom);
1212       setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
1213 
1214       // 128bit results imply a bigger than NEON input.
1215       for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1216         setOperationAction(ISD::TRUNCATE, VT, Custom);
1217       for (auto VT : {MVT::v8f16, MVT::v4f32})
1218         setOperationAction(ISD::FP_ROUND, VT, Expand);
1219 
1220       // These operations are not supported on NEON but SVE can do them.
1221       setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1222       setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1223       setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1224       setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1225       setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1226       setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1227       setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
1228       setOperationAction(ISD::SDIV, MVT::v16i8, Custom);
1229       setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
1230       setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
1231       setOperationAction(ISD::SDIV, MVT::v2i32, Custom);
1232       setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
1233       setOperationAction(ISD::SDIV, MVT::v1i64, Custom);
1234       setOperationAction(ISD::SDIV, MVT::v2i64, Custom);
1235       setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1236       setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1237       setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1238       setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1239       setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
1240       setOperationAction(ISD::UDIV, MVT::v16i8, Custom);
1241       setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
1242       setOperationAction(ISD::UDIV, MVT::v8i16, Custom);
1243       setOperationAction(ISD::UDIV, MVT::v2i32, Custom);
1244       setOperationAction(ISD::UDIV, MVT::v4i32, Custom);
1245       setOperationAction(ISD::UDIV, MVT::v1i64, Custom);
1246       setOperationAction(ISD::UDIV, MVT::v2i64, Custom);
1247       setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1248       setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1249       setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1250       setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1251       setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1252       setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1253       setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1254       setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1255 
1256       // Int operations with no NEON support.
1257       for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1258                       MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1259         setOperationAction(ISD::BITREVERSE, VT, Custom);
1260         setOperationAction(ISD::CTTZ, VT, Custom);
1261         setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1262         setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1263         setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1264       }
1265 
1266       // FP operations with no NEON support.
1267       for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32,
1268                       MVT::v1f64, MVT::v2f64})
1269         setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1270 
1271       // Use SVE for vectors with more than 2 elements.
1272       for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1273         setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1274     }
1275   }
1276 
1277   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1278 }
1279 
1280 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
1281   assert(VT.isVector() && "VT should be a vector type");
1282 
1283   if (VT.isFloatingPoint()) {
1284     MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
1285     setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1286     setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1287   }
1288 
1289   // Mark vector float intrinsics as expand.
1290   if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1291     setOperationAction(ISD::FSIN, VT, Expand);
1292     setOperationAction(ISD::FCOS, VT, Expand);
1293     setOperationAction(ISD::FPOW, VT, Expand);
1294     setOperationAction(ISD::FLOG, VT, Expand);
1295     setOperationAction(ISD::FLOG2, VT, Expand);
1296     setOperationAction(ISD::FLOG10, VT, Expand);
1297     setOperationAction(ISD::FEXP, VT, Expand);
1298     setOperationAction(ISD::FEXP2, VT, Expand);
1299 
1300     // But we do support custom-lowering for FCOPYSIGN.
1301     setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1302   }
1303 
1304   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1305   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1306   setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1307   setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1308   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1309   setOperationAction(ISD::SRA, VT, Custom);
1310   setOperationAction(ISD::SRL, VT, Custom);
1311   setOperationAction(ISD::SHL, VT, Custom);
1312   setOperationAction(ISD::OR, VT, Custom);
1313   setOperationAction(ISD::SETCC, VT, Custom);
1314   setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
1315 
1316   setOperationAction(ISD::SELECT, VT, Expand);
1317   setOperationAction(ISD::SELECT_CC, VT, Expand);
1318   setOperationAction(ISD::VSELECT, VT, Expand);
1319   for (MVT InnerVT : MVT::all_valuetypes())
1320     setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1321 
1322   // CNT supports only B element sizes, then use UADDLP to widen.
1323   if (VT != MVT::v8i8 && VT != MVT::v16i8)
1324     setOperationAction(ISD::CTPOP, VT, Custom);
1325 
1326   setOperationAction(ISD::UDIV, VT, Expand);
1327   setOperationAction(ISD::SDIV, VT, Expand);
1328   setOperationAction(ISD::UREM, VT, Expand);
1329   setOperationAction(ISD::SREM, VT, Expand);
1330   setOperationAction(ISD::FREM, VT, Expand);
1331 
1332   setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1333   setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1334 
1335   if (!VT.isFloatingPoint())
1336     setOperationAction(ISD::ABS, VT, Legal);
1337 
1338   // [SU][MIN|MAX] are available for all NEON types apart from i64.
1339   if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1340     for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1341       setOperationAction(Opcode, VT, Legal);
1342 
1343   // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
1344   if (VT.isFloatingPoint() &&
1345       VT.getVectorElementType() != MVT::bf16 &&
1346       (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1347     for (unsigned Opcode :
1348          {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
1349       setOperationAction(Opcode, VT, Legal);
1350 
1351   if (Subtarget->isLittleEndian()) {
1352     for (unsigned im = (unsigned)ISD::PRE_INC;
1353          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1354       setIndexedLoadAction(im, VT, Legal);
1355       setIndexedStoreAction(im, VT, Legal);
1356     }
1357   }
1358 }
1359 
1360 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1361   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1362 
1363   // By default everything must be expanded.
1364   for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1365     setOperationAction(Op, VT, Expand);
1366 
1367   // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
1368   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1369 
1370   // Lower fixed length vector operations to scalable equivalents.
1371   setOperationAction(ISD::ABS, VT, Custom);
1372   setOperationAction(ISD::ADD, VT, Custom);
1373   setOperationAction(ISD::AND, VT, Custom);
1374   setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1375   setOperationAction(ISD::BITREVERSE, VT, Custom);
1376   setOperationAction(ISD::BSWAP, VT, Custom);
1377   setOperationAction(ISD::CTLZ, VT, Custom);
1378   setOperationAction(ISD::CTPOP, VT, Custom);
1379   setOperationAction(ISD::CTTZ, VT, Custom);
1380   setOperationAction(ISD::FADD, VT, Custom);
1381   setOperationAction(ISD::FCEIL, VT, Custom);
1382   setOperationAction(ISD::FDIV, VT, Custom);
1383   setOperationAction(ISD::FFLOOR, VT, Custom);
1384   setOperationAction(ISD::FMA, VT, Custom);
1385   setOperationAction(ISD::FMAXNUM, VT, Custom);
1386   setOperationAction(ISD::FMINNUM, VT, Custom);
1387   setOperationAction(ISD::FMUL, VT, Custom);
1388   setOperationAction(ISD::FNEARBYINT, VT, Custom);
1389   setOperationAction(ISD::FNEG, VT, Custom);
1390   setOperationAction(ISD::FRINT, VT, Custom);
1391   setOperationAction(ISD::FROUND, VT, Custom);
1392   setOperationAction(ISD::FSQRT, VT, Custom);
1393   setOperationAction(ISD::FSUB, VT, Custom);
1394   setOperationAction(ISD::FTRUNC, VT, Custom);
1395   setOperationAction(ISD::LOAD, VT, Custom);
1396   setOperationAction(ISD::MUL, VT, Custom);
1397   setOperationAction(ISD::OR, VT, Custom);
1398   setOperationAction(ISD::SDIV, VT, Custom);
1399   setOperationAction(ISD::SETCC, VT, Custom);
1400   setOperationAction(ISD::SHL, VT, Custom);
1401   setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1402   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1403   setOperationAction(ISD::SMAX, VT, Custom);
1404   setOperationAction(ISD::SMIN, VT, Custom);
1405   setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1406   setOperationAction(ISD::SRA, VT, Custom);
1407   setOperationAction(ISD::SRL, VT, Custom);
1408   setOperationAction(ISD::STORE, VT, Custom);
1409   setOperationAction(ISD::SUB, VT, Custom);
1410   setOperationAction(ISD::TRUNCATE, VT, Custom);
1411   setOperationAction(ISD::UDIV, VT, Custom);
1412   setOperationAction(ISD::UMAX, VT, Custom);
1413   setOperationAction(ISD::UMIN, VT, Custom);
1414   setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1415   setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1416   setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1417   setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1418   setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1419   setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1420   setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1421   setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1422   setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1423   setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1424   setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1425   setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1426   setOperationAction(ISD::VSELECT, VT, Custom);
1427   setOperationAction(ISD::XOR, VT, Custom);
1428   setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1429 }
1430 
1431 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1432   addRegisterClass(VT, &AArch64::FPR64RegClass);
1433   addTypeForNEON(VT, MVT::v2i32);
1434 }
1435 
1436 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1437   addRegisterClass(VT, &AArch64::FPR128RegClass);
1438   addTypeForNEON(VT, MVT::v4i32);
1439 }
1440 
1441 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
1442                                               LLVMContext &C, EVT VT) const {
1443   if (!VT.isVector())
1444     return MVT::i32;
1445   if (VT.isScalableVector())
1446     return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
1447   return VT.changeVectorElementTypeToInteger();
1448 }
1449 
1450 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1451                                const APInt &Demanded,
1452                                TargetLowering::TargetLoweringOpt &TLO,
1453                                unsigned NewOpc) {
1454   uint64_t OldImm = Imm, NewImm, Enc;
1455   uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1456 
1457   // Return if the immediate is already all zeros, all ones, a bimm32 or a
1458   // bimm64.
1459   if (Imm == 0 || Imm == Mask ||
1460       AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
1461     return false;
1462 
1463   unsigned EltSize = Size;
1464   uint64_t DemandedBits = Demanded.getZExtValue();
1465 
1466   // Clear bits that are not demanded.
1467   Imm &= DemandedBits;
1468 
1469   while (true) {
1470     // The goal here is to set the non-demanded bits in a way that minimizes
1471     // the number of switching between 0 and 1. In order to achieve this goal,
1472     // we set the non-demanded bits to the value of the preceding demanded bits.
1473     // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
1474     // non-demanded bit), we copy bit0 (1) to the least significant 'x',
1475     // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
1476     // The final result is 0b11000011.
1477     uint64_t NonDemandedBits = ~DemandedBits;
1478     uint64_t InvertedImm = ~Imm & DemandedBits;
1479     uint64_t RotatedImm =
1480         ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
1481         NonDemandedBits;
1482     uint64_t Sum = RotatedImm + NonDemandedBits;
1483     bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
1484     uint64_t Ones = (Sum + Carry) & NonDemandedBits;
1485     NewImm = (Imm | Ones) & Mask;
1486 
1487     // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
1488     // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
1489     // we halve the element size and continue the search.
1490     if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
1491       break;
1492 
1493     // We cannot shrink the element size any further if it is 2-bits.
1494     if (EltSize == 2)
1495       return false;
1496 
1497     EltSize /= 2;
1498     Mask >>= EltSize;
1499     uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
1500 
1501     // Return if there is mismatch in any of the demanded bits of Imm and Hi.
1502     if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
1503       return false;
1504 
1505     // Merge the upper and lower halves of Imm and DemandedBits.
1506     Imm |= Hi;
1507     DemandedBits |= DemandedBitsHi;
1508   }
1509 
1510   ++NumOptimizedImms;
1511 
1512   // Replicate the element across the register width.
1513   while (EltSize < Size) {
1514     NewImm |= NewImm << EltSize;
1515     EltSize *= 2;
1516   }
1517 
1518   (void)OldImm;
1519   assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
1520          "demanded bits should never be altered");
1521   assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
1522 
1523   // Create the new constant immediate node.
1524   EVT VT = Op.getValueType();
1525   SDLoc DL(Op);
1526   SDValue New;
1527 
1528   // If the new constant immediate is all-zeros or all-ones, let the target
1529   // independent DAG combine optimize this node.
1530   if (NewImm == 0 || NewImm == OrigMask) {
1531     New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
1532                           TLO.DAG.getConstant(NewImm, DL, VT));
1533   // Otherwise, create a machine node so that target independent DAG combine
1534   // doesn't undo this optimization.
1535   } else {
1536     Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
1537     SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
1538     New = SDValue(
1539         TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
1540   }
1541 
1542   return TLO.CombineTo(Op, New);
1543 }
1544 
1545 bool AArch64TargetLowering::targetShrinkDemandedConstant(
1546     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1547     TargetLoweringOpt &TLO) const {
1548   // Delay this optimization to as late as possible.
1549   if (!TLO.LegalOps)
1550     return false;
1551 
1552   if (!EnableOptimizeLogicalImm)
1553     return false;
1554 
1555   EVT VT = Op.getValueType();
1556   if (VT.isVector())
1557     return false;
1558 
1559   unsigned Size = VT.getSizeInBits();
1560   assert((Size == 32 || Size == 64) &&
1561          "i32 or i64 is expected after legalization.");
1562 
1563   // Exit early if we demand all bits.
1564   if (DemandedBits.countPopulation() == Size)
1565     return false;
1566 
1567   unsigned NewOpc;
1568   switch (Op.getOpcode()) {
1569   default:
1570     return false;
1571   case ISD::AND:
1572     NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1573     break;
1574   case ISD::OR:
1575     NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1576     break;
1577   case ISD::XOR:
1578     NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1579     break;
1580   }
1581   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1582   if (!C)
1583     return false;
1584   uint64_t Imm = C->getZExtValue();
1585   return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
1586 }
1587 
1588 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1589 /// Mask are known to be either zero or one and return them Known.
1590 void AArch64TargetLowering::computeKnownBitsForTargetNode(
1591     const SDValue Op, KnownBits &Known,
1592     const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
1593   switch (Op.getOpcode()) {
1594   default:
1595     break;
1596   case AArch64ISD::CSEL: {
1597     KnownBits Known2;
1598     Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1599     Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1600     Known = KnownBits::commonBits(Known, Known2);
1601     break;
1602   }
1603   case AArch64ISD::LOADgot:
1604   case AArch64ISD::ADDlow: {
1605     if (!Subtarget->isTargetILP32())
1606       break;
1607     // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
1608     Known.Zero = APInt::getHighBitsSet(64, 32);
1609     break;
1610   }
1611   case ISD::INTRINSIC_W_CHAIN: {
1612     ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1613     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1614     switch (IntID) {
1615     default: return;
1616     case Intrinsic::aarch64_ldaxr:
1617     case Intrinsic::aarch64_ldxr: {
1618       unsigned BitWidth = Known.getBitWidth();
1619       EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1620       unsigned MemBits = VT.getScalarSizeInBits();
1621       Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1622       return;
1623     }
1624     }
1625     break;
1626   }
1627   case ISD::INTRINSIC_WO_CHAIN:
1628   case ISD::INTRINSIC_VOID: {
1629     unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1630     switch (IntNo) {
1631     default:
1632       break;
1633     case Intrinsic::aarch64_neon_umaxv:
1634     case Intrinsic::aarch64_neon_uminv: {
1635       // Figure out the datatype of the vector operand. The UMINV instruction
1636       // will zero extend the result, so we can mark as known zero all the
1637       // bits larger than the element datatype. 32-bit or larget doesn't need
1638       // this as those are legal types and will be handled by isel directly.
1639       MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1640       unsigned BitWidth = Known.getBitWidth();
1641       if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1642         assert(BitWidth >= 8 && "Unexpected width!");
1643         APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1644         Known.Zero |= Mask;
1645       } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1646         assert(BitWidth >= 16 && "Unexpected width!");
1647         APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1648         Known.Zero |= Mask;
1649       }
1650       break;
1651     } break;
1652     }
1653   }
1654   }
1655 }
1656 
1657 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
1658                                                   EVT) const {
1659   return MVT::i64;
1660 }
1661 
1662 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1663     EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
1664     bool *Fast) const {
1665   if (Subtarget->requiresStrictAlign())
1666     return false;
1667 
1668   if (Fast) {
1669     // Some CPUs are fine with unaligned stores except for 128-bit ones.
1670     *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1671             // See comments in performSTORECombine() for more details about
1672             // these conditions.
1673 
1674             // Code that uses clang vector extensions can mark that it
1675             // wants unaligned accesses to be treated as fast by
1676             // underspecifying alignment to be 1 or 2.
1677             Align <= 2 ||
1678 
1679             // Disregard v2i64. Memcpy lowering produces those and splitting
1680             // them regresses performance on micro-benchmarks and olden/bh.
1681             VT == MVT::v2i64;
1682   }
1683   return true;
1684 }
1685 
1686 // Same as above but handling LLTs instead.
1687 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1688     LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1689     bool *Fast) const {
1690   if (Subtarget->requiresStrictAlign())
1691     return false;
1692 
1693   if (Fast) {
1694     // Some CPUs are fine with unaligned stores except for 128-bit ones.
1695     *Fast = !Subtarget->isMisaligned128StoreSlow() ||
1696             Ty.getSizeInBytes() != 16 ||
1697             // See comments in performSTORECombine() for more details about
1698             // these conditions.
1699 
1700             // Code that uses clang vector extensions can mark that it
1701             // wants unaligned accesses to be treated as fast by
1702             // underspecifying alignment to be 1 or 2.
1703             Alignment <= 2 ||
1704 
1705             // Disregard v2i64. Memcpy lowering produces those and splitting
1706             // them regresses performance on micro-benchmarks and olden/bh.
1707             Ty == LLT::vector(2, 64);
1708   }
1709   return true;
1710 }
1711 
1712 FastISel *
1713 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1714                                       const TargetLibraryInfo *libInfo) const {
1715   return AArch64::createFastISel(funcInfo, libInfo);
1716 }
1717 
1718 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1719 #define MAKE_CASE(V)                                                           \
1720   case V:                                                                      \
1721     return #V;
1722   switch ((AArch64ISD::NodeType)Opcode) {
1723   case AArch64ISD::FIRST_NUMBER:
1724     break;
1725     MAKE_CASE(AArch64ISD::CALL)
1726     MAKE_CASE(AArch64ISD::ADRP)
1727     MAKE_CASE(AArch64ISD::ADR)
1728     MAKE_CASE(AArch64ISD::ADDlow)
1729     MAKE_CASE(AArch64ISD::LOADgot)
1730     MAKE_CASE(AArch64ISD::RET_FLAG)
1731     MAKE_CASE(AArch64ISD::BRCOND)
1732     MAKE_CASE(AArch64ISD::CSEL)
1733     MAKE_CASE(AArch64ISD::FCSEL)
1734     MAKE_CASE(AArch64ISD::CSINV)
1735     MAKE_CASE(AArch64ISD::CSNEG)
1736     MAKE_CASE(AArch64ISD::CSINC)
1737     MAKE_CASE(AArch64ISD::THREAD_POINTER)
1738     MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
1739     MAKE_CASE(AArch64ISD::ADD_PRED)
1740     MAKE_CASE(AArch64ISD::MUL_PRED)
1741     MAKE_CASE(AArch64ISD::SDIV_PRED)
1742     MAKE_CASE(AArch64ISD::SHL_PRED)
1743     MAKE_CASE(AArch64ISD::SMAX_PRED)
1744     MAKE_CASE(AArch64ISD::SMIN_PRED)
1745     MAKE_CASE(AArch64ISD::SRA_PRED)
1746     MAKE_CASE(AArch64ISD::SRL_PRED)
1747     MAKE_CASE(AArch64ISD::SUB_PRED)
1748     MAKE_CASE(AArch64ISD::UDIV_PRED)
1749     MAKE_CASE(AArch64ISD::UMAX_PRED)
1750     MAKE_CASE(AArch64ISD::UMIN_PRED)
1751     MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU)
1752     MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
1753     MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
1754     MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU)
1755     MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU)
1756     MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU)
1757     MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU)
1758     MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU)
1759     MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU)
1760     MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU)
1761     MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU)
1762     MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
1763     MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
1764     MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
1765     MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
1766     MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
1767     MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
1768     MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU)
1769     MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU)
1770     MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU)
1771     MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU)
1772     MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
1773     MAKE_CASE(AArch64ISD::ADC)
1774     MAKE_CASE(AArch64ISD::SBC)
1775     MAKE_CASE(AArch64ISD::ADDS)
1776     MAKE_CASE(AArch64ISD::SUBS)
1777     MAKE_CASE(AArch64ISD::ADCS)
1778     MAKE_CASE(AArch64ISD::SBCS)
1779     MAKE_CASE(AArch64ISD::ANDS)
1780     MAKE_CASE(AArch64ISD::CCMP)
1781     MAKE_CASE(AArch64ISD::CCMN)
1782     MAKE_CASE(AArch64ISD::FCCMP)
1783     MAKE_CASE(AArch64ISD::FCMP)
1784     MAKE_CASE(AArch64ISD::STRICT_FCMP)
1785     MAKE_CASE(AArch64ISD::STRICT_FCMPE)
1786     MAKE_CASE(AArch64ISD::DUP)
1787     MAKE_CASE(AArch64ISD::DUPLANE8)
1788     MAKE_CASE(AArch64ISD::DUPLANE16)
1789     MAKE_CASE(AArch64ISD::DUPLANE32)
1790     MAKE_CASE(AArch64ISD::DUPLANE64)
1791     MAKE_CASE(AArch64ISD::MOVI)
1792     MAKE_CASE(AArch64ISD::MOVIshift)
1793     MAKE_CASE(AArch64ISD::MOVIedit)
1794     MAKE_CASE(AArch64ISD::MOVImsl)
1795     MAKE_CASE(AArch64ISD::FMOV)
1796     MAKE_CASE(AArch64ISD::MVNIshift)
1797     MAKE_CASE(AArch64ISD::MVNImsl)
1798     MAKE_CASE(AArch64ISD::BICi)
1799     MAKE_CASE(AArch64ISD::ORRi)
1800     MAKE_CASE(AArch64ISD::BSP)
1801     MAKE_CASE(AArch64ISD::NEG)
1802     MAKE_CASE(AArch64ISD::EXTR)
1803     MAKE_CASE(AArch64ISD::ZIP1)
1804     MAKE_CASE(AArch64ISD::ZIP2)
1805     MAKE_CASE(AArch64ISD::UZP1)
1806     MAKE_CASE(AArch64ISD::UZP2)
1807     MAKE_CASE(AArch64ISD::TRN1)
1808     MAKE_CASE(AArch64ISD::TRN2)
1809     MAKE_CASE(AArch64ISD::REV16)
1810     MAKE_CASE(AArch64ISD::REV32)
1811     MAKE_CASE(AArch64ISD::REV64)
1812     MAKE_CASE(AArch64ISD::EXT)
1813     MAKE_CASE(AArch64ISD::VSHL)
1814     MAKE_CASE(AArch64ISD::VLSHR)
1815     MAKE_CASE(AArch64ISD::VASHR)
1816     MAKE_CASE(AArch64ISD::VSLI)
1817     MAKE_CASE(AArch64ISD::VSRI)
1818     MAKE_CASE(AArch64ISD::CMEQ)
1819     MAKE_CASE(AArch64ISD::CMGE)
1820     MAKE_CASE(AArch64ISD::CMGT)
1821     MAKE_CASE(AArch64ISD::CMHI)
1822     MAKE_CASE(AArch64ISD::CMHS)
1823     MAKE_CASE(AArch64ISD::FCMEQ)
1824     MAKE_CASE(AArch64ISD::FCMGE)
1825     MAKE_CASE(AArch64ISD::FCMGT)
1826     MAKE_CASE(AArch64ISD::CMEQz)
1827     MAKE_CASE(AArch64ISD::CMGEz)
1828     MAKE_CASE(AArch64ISD::CMGTz)
1829     MAKE_CASE(AArch64ISD::CMLEz)
1830     MAKE_CASE(AArch64ISD::CMLTz)
1831     MAKE_CASE(AArch64ISD::FCMEQz)
1832     MAKE_CASE(AArch64ISD::FCMGEz)
1833     MAKE_CASE(AArch64ISD::FCMGTz)
1834     MAKE_CASE(AArch64ISD::FCMLEz)
1835     MAKE_CASE(AArch64ISD::FCMLTz)
1836     MAKE_CASE(AArch64ISD::SADDV)
1837     MAKE_CASE(AArch64ISD::UADDV)
1838     MAKE_CASE(AArch64ISD::SRHADD)
1839     MAKE_CASE(AArch64ISD::URHADD)
1840     MAKE_CASE(AArch64ISD::SHADD)
1841     MAKE_CASE(AArch64ISD::UHADD)
1842     MAKE_CASE(AArch64ISD::SMINV)
1843     MAKE_CASE(AArch64ISD::UMINV)
1844     MAKE_CASE(AArch64ISD::SMAXV)
1845     MAKE_CASE(AArch64ISD::UMAXV)
1846     MAKE_CASE(AArch64ISD::SADDV_PRED)
1847     MAKE_CASE(AArch64ISD::UADDV_PRED)
1848     MAKE_CASE(AArch64ISD::SMAXV_PRED)
1849     MAKE_CASE(AArch64ISD::UMAXV_PRED)
1850     MAKE_CASE(AArch64ISD::SMINV_PRED)
1851     MAKE_CASE(AArch64ISD::UMINV_PRED)
1852     MAKE_CASE(AArch64ISD::ORV_PRED)
1853     MAKE_CASE(AArch64ISD::EORV_PRED)
1854     MAKE_CASE(AArch64ISD::ANDV_PRED)
1855     MAKE_CASE(AArch64ISD::CLASTA_N)
1856     MAKE_CASE(AArch64ISD::CLASTB_N)
1857     MAKE_CASE(AArch64ISD::LASTA)
1858     MAKE_CASE(AArch64ISD::LASTB)
1859     MAKE_CASE(AArch64ISD::REV)
1860     MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
1861     MAKE_CASE(AArch64ISD::TBL)
1862     MAKE_CASE(AArch64ISD::FADD_PRED)
1863     MAKE_CASE(AArch64ISD::FADDA_PRED)
1864     MAKE_CASE(AArch64ISD::FADDV_PRED)
1865     MAKE_CASE(AArch64ISD::FDIV_PRED)
1866     MAKE_CASE(AArch64ISD::FMA_PRED)
1867     MAKE_CASE(AArch64ISD::FMAXV_PRED)
1868     MAKE_CASE(AArch64ISD::FMAXNM_PRED)
1869     MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
1870     MAKE_CASE(AArch64ISD::FMINV_PRED)
1871     MAKE_CASE(AArch64ISD::FMINNM_PRED)
1872     MAKE_CASE(AArch64ISD::FMINNMV_PRED)
1873     MAKE_CASE(AArch64ISD::FMUL_PRED)
1874     MAKE_CASE(AArch64ISD::FSUB_PRED)
1875     MAKE_CASE(AArch64ISD::BIT)
1876     MAKE_CASE(AArch64ISD::CBZ)
1877     MAKE_CASE(AArch64ISD::CBNZ)
1878     MAKE_CASE(AArch64ISD::TBZ)
1879     MAKE_CASE(AArch64ISD::TBNZ)
1880     MAKE_CASE(AArch64ISD::TC_RETURN)
1881     MAKE_CASE(AArch64ISD::PREFETCH)
1882     MAKE_CASE(AArch64ISD::SITOF)
1883     MAKE_CASE(AArch64ISD::UITOF)
1884     MAKE_CASE(AArch64ISD::NVCAST)
1885     MAKE_CASE(AArch64ISD::SQSHL_I)
1886     MAKE_CASE(AArch64ISD::UQSHL_I)
1887     MAKE_CASE(AArch64ISD::SRSHR_I)
1888     MAKE_CASE(AArch64ISD::URSHR_I)
1889     MAKE_CASE(AArch64ISD::SQSHLU_I)
1890     MAKE_CASE(AArch64ISD::WrapperLarge)
1891     MAKE_CASE(AArch64ISD::LD2post)
1892     MAKE_CASE(AArch64ISD::LD3post)
1893     MAKE_CASE(AArch64ISD::LD4post)
1894     MAKE_CASE(AArch64ISD::ST2post)
1895     MAKE_CASE(AArch64ISD::ST3post)
1896     MAKE_CASE(AArch64ISD::ST4post)
1897     MAKE_CASE(AArch64ISD::LD1x2post)
1898     MAKE_CASE(AArch64ISD::LD1x3post)
1899     MAKE_CASE(AArch64ISD::LD1x4post)
1900     MAKE_CASE(AArch64ISD::ST1x2post)
1901     MAKE_CASE(AArch64ISD::ST1x3post)
1902     MAKE_CASE(AArch64ISD::ST1x4post)
1903     MAKE_CASE(AArch64ISD::LD1DUPpost)
1904     MAKE_CASE(AArch64ISD::LD2DUPpost)
1905     MAKE_CASE(AArch64ISD::LD3DUPpost)
1906     MAKE_CASE(AArch64ISD::LD4DUPpost)
1907     MAKE_CASE(AArch64ISD::LD1LANEpost)
1908     MAKE_CASE(AArch64ISD::LD2LANEpost)
1909     MAKE_CASE(AArch64ISD::LD3LANEpost)
1910     MAKE_CASE(AArch64ISD::LD4LANEpost)
1911     MAKE_CASE(AArch64ISD::ST2LANEpost)
1912     MAKE_CASE(AArch64ISD::ST3LANEpost)
1913     MAKE_CASE(AArch64ISD::ST4LANEpost)
1914     MAKE_CASE(AArch64ISD::SMULL)
1915     MAKE_CASE(AArch64ISD::UMULL)
1916     MAKE_CASE(AArch64ISD::FRECPE)
1917     MAKE_CASE(AArch64ISD::FRECPS)
1918     MAKE_CASE(AArch64ISD::FRSQRTE)
1919     MAKE_CASE(AArch64ISD::FRSQRTS)
1920     MAKE_CASE(AArch64ISD::STG)
1921     MAKE_CASE(AArch64ISD::STZG)
1922     MAKE_CASE(AArch64ISD::ST2G)
1923     MAKE_CASE(AArch64ISD::STZ2G)
1924     MAKE_CASE(AArch64ISD::SUNPKHI)
1925     MAKE_CASE(AArch64ISD::SUNPKLO)
1926     MAKE_CASE(AArch64ISD::UUNPKHI)
1927     MAKE_CASE(AArch64ISD::UUNPKLO)
1928     MAKE_CASE(AArch64ISD::INSR)
1929     MAKE_CASE(AArch64ISD::PTEST)
1930     MAKE_CASE(AArch64ISD::PTRUE)
1931     MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
1932     MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
1933     MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
1934     MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
1935     MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
1936     MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
1937     MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
1938     MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
1939     MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
1940     MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
1941     MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
1942     MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
1943     MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
1944     MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
1945     MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
1946     MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
1947     MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
1948     MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
1949     MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
1950     MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
1951     MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
1952     MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
1953     MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
1954     MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
1955     MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
1956     MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
1957     MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
1958     MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
1959     MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
1960     MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
1961     MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
1962     MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
1963     MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
1964     MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
1965     MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
1966     MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
1967     MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
1968     MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
1969     MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
1970     MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
1971     MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
1972     MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
1973     MAKE_CASE(AArch64ISD::ST1_PRED)
1974     MAKE_CASE(AArch64ISD::SST1_PRED)
1975     MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
1976     MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
1977     MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
1978     MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
1979     MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
1980     MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
1981     MAKE_CASE(AArch64ISD::SSTNT1_PRED)
1982     MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
1983     MAKE_CASE(AArch64ISD::LDP)
1984     MAKE_CASE(AArch64ISD::STP)
1985     MAKE_CASE(AArch64ISD::STNP)
1986     MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
1987     MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
1988     MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
1989     MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
1990     MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
1991     MAKE_CASE(AArch64ISD::INDEX_VECTOR)
1992     MAKE_CASE(AArch64ISD::UABD)
1993     MAKE_CASE(AArch64ISD::SABD)
1994     MAKE_CASE(AArch64ISD::CALL_RVMARKER)
1995   }
1996 #undef MAKE_CASE
1997   return nullptr;
1998 }
1999 
2000 MachineBasicBlock *
2001 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2002                                     MachineBasicBlock *MBB) const {
2003   // We materialise the F128CSEL pseudo-instruction as some control flow and a
2004   // phi node:
2005 
2006   // OrigBB:
2007   //     [... previous instrs leading to comparison ...]
2008   //     b.ne TrueBB
2009   //     b EndBB
2010   // TrueBB:
2011   //     ; Fallthrough
2012   // EndBB:
2013   //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2014 
2015   MachineFunction *MF = MBB->getParent();
2016   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2017   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2018   DebugLoc DL = MI.getDebugLoc();
2019   MachineFunction::iterator It = ++MBB->getIterator();
2020 
2021   Register DestReg = MI.getOperand(0).getReg();
2022   Register IfTrueReg = MI.getOperand(1).getReg();
2023   Register IfFalseReg = MI.getOperand(2).getReg();
2024   unsigned CondCode = MI.getOperand(3).getImm();
2025   bool NZCVKilled = MI.getOperand(4).isKill();
2026 
2027   MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2028   MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2029   MF->insert(It, TrueBB);
2030   MF->insert(It, EndBB);
2031 
2032   // Transfer rest of current basic-block to EndBB
2033   EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2034                 MBB->end());
2035   EndBB->transferSuccessorsAndUpdatePHIs(MBB);
2036 
2037   BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2038   BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2039   MBB->addSuccessor(TrueBB);
2040   MBB->addSuccessor(EndBB);
2041 
2042   // TrueBB falls through to the end.
2043   TrueBB->addSuccessor(EndBB);
2044 
2045   if (!NZCVKilled) {
2046     TrueBB->addLiveIn(AArch64::NZCV);
2047     EndBB->addLiveIn(AArch64::NZCV);
2048   }
2049 
2050   BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2051       .addReg(IfTrueReg)
2052       .addMBB(TrueBB)
2053       .addReg(IfFalseReg)
2054       .addMBB(MBB);
2055 
2056   MI.eraseFromParent();
2057   return EndBB;
2058 }
2059 
2060 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2061        MachineInstr &MI, MachineBasicBlock *BB) const {
2062   assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2063              BB->getParent()->getFunction().getPersonalityFn())) &&
2064          "SEH does not use catchret!");
2065   return BB;
2066 }
2067 
2068 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2069     MachineInstr &MI, MachineBasicBlock *BB) const {
2070   switch (MI.getOpcode()) {
2071   default:
2072 #ifndef NDEBUG
2073     MI.dump();
2074 #endif
2075     llvm_unreachable("Unexpected instruction for custom inserter!");
2076 
2077   case AArch64::F128CSEL:
2078     return EmitF128CSEL(MI, BB);
2079 
2080   case TargetOpcode::STACKMAP:
2081   case TargetOpcode::PATCHPOINT:
2082   case TargetOpcode::STATEPOINT:
2083     return emitPatchPoint(MI, BB);
2084 
2085   case AArch64::CATCHRET:
2086     return EmitLoweredCatchRet(MI, BB);
2087   }
2088 }
2089 
2090 //===----------------------------------------------------------------------===//
2091 // AArch64 Lowering private implementation.
2092 //===----------------------------------------------------------------------===//
2093 
2094 //===----------------------------------------------------------------------===//
2095 // Lowering Code
2096 //===----------------------------------------------------------------------===//
2097 
2098 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2099 /// CC
2100 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
2101   switch (CC) {
2102   default:
2103     llvm_unreachable("Unknown condition code!");
2104   case ISD::SETNE:
2105     return AArch64CC::NE;
2106   case ISD::SETEQ:
2107     return AArch64CC::EQ;
2108   case ISD::SETGT:
2109     return AArch64CC::GT;
2110   case ISD::SETGE:
2111     return AArch64CC::GE;
2112   case ISD::SETLT:
2113     return AArch64CC::LT;
2114   case ISD::SETLE:
2115     return AArch64CC::LE;
2116   case ISD::SETUGT:
2117     return AArch64CC::HI;
2118   case ISD::SETUGE:
2119     return AArch64CC::HS;
2120   case ISD::SETULT:
2121     return AArch64CC::LO;
2122   case ISD::SETULE:
2123     return AArch64CC::LS;
2124   }
2125 }
2126 
2127 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2128 static void changeFPCCToAArch64CC(ISD::CondCode CC,
2129                                   AArch64CC::CondCode &CondCode,
2130                                   AArch64CC::CondCode &CondCode2) {
2131   CondCode2 = AArch64CC::AL;
2132   switch (CC) {
2133   default:
2134     llvm_unreachable("Unknown FP condition!");
2135   case ISD::SETEQ:
2136   case ISD::SETOEQ:
2137     CondCode = AArch64CC::EQ;
2138     break;
2139   case ISD::SETGT:
2140   case ISD::SETOGT:
2141     CondCode = AArch64CC::GT;
2142     break;
2143   case ISD::SETGE:
2144   case ISD::SETOGE:
2145     CondCode = AArch64CC::GE;
2146     break;
2147   case ISD::SETOLT:
2148     CondCode = AArch64CC::MI;
2149     break;
2150   case ISD::SETOLE:
2151     CondCode = AArch64CC::LS;
2152     break;
2153   case ISD::SETONE:
2154     CondCode = AArch64CC::MI;
2155     CondCode2 = AArch64CC::GT;
2156     break;
2157   case ISD::SETO:
2158     CondCode = AArch64CC::VC;
2159     break;
2160   case ISD::SETUO:
2161     CondCode = AArch64CC::VS;
2162     break;
2163   case ISD::SETUEQ:
2164     CondCode = AArch64CC::EQ;
2165     CondCode2 = AArch64CC::VS;
2166     break;
2167   case ISD::SETUGT:
2168     CondCode = AArch64CC::HI;
2169     break;
2170   case ISD::SETUGE:
2171     CondCode = AArch64CC::PL;
2172     break;
2173   case ISD::SETLT:
2174   case ISD::SETULT:
2175     CondCode = AArch64CC::LT;
2176     break;
2177   case ISD::SETLE:
2178   case ISD::SETULE:
2179     CondCode = AArch64CC::LE;
2180     break;
2181   case ISD::SETNE:
2182   case ISD::SETUNE:
2183     CondCode = AArch64CC::NE;
2184     break;
2185   }
2186 }
2187 
2188 /// Convert a DAG fp condition code to an AArch64 CC.
2189 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
2190 /// should be AND'ed instead of OR'ed.
2191 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
2192                                      AArch64CC::CondCode &CondCode,
2193                                      AArch64CC::CondCode &CondCode2) {
2194   CondCode2 = AArch64CC::AL;
2195   switch (CC) {
2196   default:
2197     changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2198     assert(CondCode2 == AArch64CC::AL);
2199     break;
2200   case ISD::SETONE:
2201     // (a one b)
2202     // == ((a olt b) || (a ogt b))
2203     // == ((a ord b) && (a une b))
2204     CondCode = AArch64CC::VC;
2205     CondCode2 = AArch64CC::NE;
2206     break;
2207   case ISD::SETUEQ:
2208     // (a ueq b)
2209     // == ((a uno b) || (a oeq b))
2210     // == ((a ule b) && (a uge b))
2211     CondCode = AArch64CC::PL;
2212     CondCode2 = AArch64CC::LE;
2213     break;
2214   }
2215 }
2216 
2217 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
2218 /// CC usable with the vector instructions. Fewer operations are available
2219 /// without a real NZCV register, so we have to use less efficient combinations
2220 /// to get the same effect.
2221 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
2222                                         AArch64CC::CondCode &CondCode,
2223                                         AArch64CC::CondCode &CondCode2,
2224                                         bool &Invert) {
2225   Invert = false;
2226   switch (CC) {
2227   default:
2228     // Mostly the scalar mappings work fine.
2229     changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2230     break;
2231   case ISD::SETUO:
2232     Invert = true;
2233     LLVM_FALLTHROUGH;
2234   case ISD::SETO:
2235     CondCode = AArch64CC::MI;
2236     CondCode2 = AArch64CC::GE;
2237     break;
2238   case ISD::SETUEQ:
2239   case ISD::SETULT:
2240   case ISD::SETULE:
2241   case ISD::SETUGT:
2242   case ISD::SETUGE:
2243     // All of the compare-mask comparisons are ordered, but we can switch
2244     // between the two by a double inversion. E.g. ULE == !OGT.
2245     Invert = true;
2246     changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
2247                           CondCode, CondCode2);
2248     break;
2249   }
2250 }
2251 
2252 static bool isLegalArithImmed(uint64_t C) {
2253   // Matches AArch64DAGToDAGISel::SelectArithImmed().
2254   bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
2255   LLVM_DEBUG(dbgs() << "Is imm " << C
2256                     << " legal: " << (IsLegal ? "yes\n" : "no\n"));
2257   return IsLegal;
2258 }
2259 
2260 // Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
2261 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
2262 // can be set differently by this operation. It comes down to whether
2263 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
2264 // everything is fine. If not then the optimization is wrong. Thus general
2265 // comparisons are only valid if op2 != 0.
2266 //
2267 // So, finally, the only LLVM-native comparisons that don't mention C and V
2268 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
2269 // the absence of information about op2.
2270 static bool isCMN(SDValue Op, ISD::CondCode CC) {
2271   return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
2272          (CC == ISD::SETEQ || CC == ISD::SETNE);
2273 }
2274 
2275 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
2276                                       SelectionDAG &DAG, SDValue Chain,
2277                                       bool IsSignaling) {
2278   EVT VT = LHS.getValueType();
2279   assert(VT != MVT::f128);
2280   assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
2281   unsigned Opcode =
2282       IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
2283   return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
2284 }
2285 
2286 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2287                               const SDLoc &dl, SelectionDAG &DAG) {
2288   EVT VT = LHS.getValueType();
2289   const bool FullFP16 =
2290     static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2291 
2292   if (VT.isFloatingPoint()) {
2293     assert(VT != MVT::f128);
2294     if (VT == MVT::f16 && !FullFP16) {
2295       LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
2296       RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
2297       VT = MVT::f32;
2298     }
2299     return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
2300   }
2301 
2302   // The CMP instruction is just an alias for SUBS, and representing it as
2303   // SUBS means that it's possible to get CSE with subtract operations.
2304   // A later phase can perform the optimization of setting the destination
2305   // register to WZR/XZR if it ends up being unused.
2306   unsigned Opcode = AArch64ISD::SUBS;
2307 
2308   if (isCMN(RHS, CC)) {
2309     // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
2310     Opcode = AArch64ISD::ADDS;
2311     RHS = RHS.getOperand(1);
2312   } else if (isCMN(LHS, CC)) {
2313     // As we are looking for EQ/NE compares, the operands can be commuted ; can
2314     // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
2315     Opcode = AArch64ISD::ADDS;
2316     LHS = LHS.getOperand(1);
2317   } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
2318     if (LHS.getOpcode() == ISD::AND) {
2319       // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
2320       // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
2321       // of the signed comparisons.
2322       const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
2323                                            DAG.getVTList(VT, MVT_CC),
2324                                            LHS.getOperand(0),
2325                                            LHS.getOperand(1));
2326       // Replace all users of (and X, Y) with newly generated (ands X, Y)
2327       DAG.ReplaceAllUsesWith(LHS, ANDSNode);
2328       return ANDSNode.getValue(1);
2329     } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
2330       // Use result of ANDS
2331       return LHS.getValue(1);
2332     }
2333   }
2334 
2335   return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
2336       .getValue(1);
2337 }
2338 
2339 /// \defgroup AArch64CCMP CMP;CCMP matching
2340 ///
2341 /// These functions deal with the formation of CMP;CCMP;... sequences.
2342 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
2343 /// a comparison. They set the NZCV flags to a predefined value if their
2344 /// predicate is false. This allows to express arbitrary conjunctions, for
2345 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
2346 /// expressed as:
2347 ///   cmp A
2348 ///   ccmp B, inv(CB), CA
2349 ///   check for CB flags
2350 ///
2351 /// This naturally lets us implement chains of AND operations with SETCC
2352 /// operands. And we can even implement some other situations by transforming
2353 /// them:
2354 ///   - We can implement (NEG SETCC) i.e. negating a single comparison by
2355 ///     negating the flags used in a CCMP/FCCMP operations.
2356 ///   - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
2357 ///     by negating the flags we test for afterwards. i.e.
///     NEG (CMP CCMP CCMP ...) can be implemented.
2359 ///   - Note that we can only ever negate all previously processed results.
2360 ///     What we can not implement by flipping the flags to test is a negation
2361 ///     of two sub-trees (because the negation affects all sub-trees emitted so
2362 ///     far, so the 2nd sub-tree we emit would also affect the first).
2363 /// With those tools we can implement some OR operations:
2364 ///   - (OR (SETCC A) (SETCC B)) can be implemented via:
2365 ///     NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
2366 ///   - After transforming OR to NEG/AND combinations we may be able to use NEG
2367 ///     elimination rules from earlier to implement the whole thing as a
2368 ///     CCMP/FCCMP chain.
2369 ///
/// As a complete example:
///     or (or (setCA (cmp A)) (setCB (cmp B)))
///        (and (setCC (cmp C)) (setCD (cmp D)))
/// can be reassociated to:
///     or (and (setCC (cmp C)) (setCD (cmp D)))
///        (or (setCA (cmp A)) (setCB (cmp B)))
/// can be transformed to:
///     not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
///              (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
2379 /// which can be implemented as:
2380 ///   cmp C
2381 ///   ccmp D, inv(CD), CC
2382 ///   ccmp A, CA, inv(CD)
2383 ///   ccmp B, CB, inv(CA)
2384 ///   check for CB flags
2385 ///
2386 /// A counterexample is "or (and A B) (and C D)" which translates to
2387 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
2388 /// can only implement 1 of the inner (not) operations, but not both!
2389 /// @{
2390 
2391 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
2392 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
2393                                          ISD::CondCode CC, SDValue CCOp,
2394                                          AArch64CC::CondCode Predicate,
2395                                          AArch64CC::CondCode OutCC,
2396                                          const SDLoc &DL, SelectionDAG &DAG) {
2397   unsigned Opcode = 0;
2398   const bool FullFP16 =
2399     static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2400 
2401   if (LHS.getValueType().isFloatingPoint()) {
2402     assert(LHS.getValueType() != MVT::f128);
2403     if (LHS.getValueType() == MVT::f16 && !FullFP16) {
2404       LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
2405       RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
2406     }
2407     Opcode = AArch64ISD::FCCMP;
2408   } else if (RHS.getOpcode() == ISD::SUB) {
2409     SDValue SubOp0 = RHS.getOperand(0);
2410     if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
2411       // See emitComparison() on why we can only do this for SETEQ and SETNE.
2412       Opcode = AArch64ISD::CCMN;
2413       RHS = RHS.getOperand(1);
2414     }
2415   }
2416   if (Opcode == 0)
2417     Opcode = AArch64ISD::CCMP;
2418 
2419   SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
2420   AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
2421   unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
2422   SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
2423   return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
2424 }
2425 
2426 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
2427 /// expressed as a conjunction. See \ref AArch64CCMP.
2428 /// \param CanNegate    Set to true if we can negate the whole sub-tree just by
2429 ///                     changing the conditions on the SETCC tests.
2430 ///                     (this means we can call emitConjunctionRec() with
2431 ///                      Negate==true on this sub-tree)
2432 /// \param MustBeFirst  Set to true if this subtree needs to be negated and we
2433 ///                     cannot do the negation naturally. We are required to
2434 ///                     emit the subtree first in this case.
2435 /// \param WillNegate   Is true if are called when the result of this
2436 ///                     subexpression must be negated. This happens when the
2437 ///                     outer expression is an OR. We can use this fact to know
2438 ///                     that we have a double negation (or (or ...) ...) that
2439 ///                     can be implemented for free.
2440 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
2441                                bool &MustBeFirst, bool WillNegate,
2442                                unsigned Depth = 0) {
2443   if (!Val.hasOneUse())
2444     return false;
2445   unsigned Opcode = Val->getOpcode();
2446   if (Opcode == ISD::SETCC) {
2447     if (Val->getOperand(0).getValueType() == MVT::f128)
2448       return false;
2449     CanNegate = true;
2450     MustBeFirst = false;
2451     return true;
2452   }
2453   // Protect against exponential runtime and stack overflow.
2454   if (Depth > 6)
2455     return false;
2456   if (Opcode == ISD::AND || Opcode == ISD::OR) {
2457     bool IsOR = Opcode == ISD::OR;
2458     SDValue O0 = Val->getOperand(0);
2459     SDValue O1 = Val->getOperand(1);
2460     bool CanNegateL;
2461     bool MustBeFirstL;
2462     if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
2463       return false;
2464     bool CanNegateR;
2465     bool MustBeFirstR;
2466     if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
2467       return false;
2468 
2469     if (MustBeFirstL && MustBeFirstR)
2470       return false;
2471 
2472     if (IsOR) {
2473       // For an OR expression we need to be able to naturally negate at least
2474       // one side or we cannot do the transformation at all.
2475       if (!CanNegateL && !CanNegateR)
2476         return false;
2477       // If we the result of the OR will be negated and we can naturally negate
2478       // the leafs, then this sub-tree as a whole negates naturally.
2479       CanNegate = WillNegate && CanNegateL && CanNegateR;
2480       // If we cannot naturally negate the whole sub-tree, then this must be
2481       // emitted first.
2482       MustBeFirst = !CanNegate;
2483     } else {
2484       assert(Opcode == ISD::AND && "Must be OR or AND");
2485       // We cannot naturally negate an AND operation.
2486       CanNegate = false;
2487       MustBeFirst = MustBeFirstL || MustBeFirstR;
2488     }
2489     return true;
2490   }
2491   return false;
2492 }
2493 
/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series compare
/// and conditional compare operations. @returns an NZCV flags producing node
/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
/// transformation was not possible.
/// \p Negate is true if we want this sub-tree being negated just by changing
/// SETCC conditions.
/// \p CCOp is the flags value produced by the part of the chain emitted so
/// far; an empty SDValue means nothing has been emitted yet, so the leaf
/// becomes an unconditional comparison. \p Predicate is the condition on
/// \p CCOp under which the next conditional comparison is performed.
///
/// The tree is expected to have been accepted by canEmitConjunction(); the
/// asserts below check this.
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
    AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
    AArch64CC::CondCode Predicate) {
  // We're at a tree leaf, produce a conditional comparison operation.
  unsigned Opcode = Val->getOpcode();
  if (Opcode == ISD::SETCC) {
    SDValue LHS = Val->getOperand(0);
    SDValue RHS = Val->getOperand(1);
    ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
    bool isInteger = LHS.getValueType().isInteger();
    // A leaf is negated by testing the inverse relation.
    if (Negate)
      CC = getSetCCInverse(CC, LHS.getValueType());
    SDLoc DL(Val);
    // Determine OutCC and handle FP special case.
    if (isInteger) {
      OutCC = changeIntCCToAArch64CC(CC);
    } else {
      assert(LHS.getValueType().isFloatingPoint());
      AArch64CC::CondCode ExtraCC;
      changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
      // Some floating point conditions can't be tested with a single condition
      // code. Construct an additional comparison in this case.
      if (ExtraCC != AArch64CC::AL) {
        SDValue ExtraCmp;
        if (!CCOp.getNode())
          ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
        else
          ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
                                               ExtraCC, DL, DAG);
        // The extra comparison becomes the head of the chain: the comparison
        // emitted below is conditional on ExtraCC holding for its flags.
        CCOp = ExtraCmp;
        Predicate = ExtraCC;
      }
    }

    // Produce a normal comparison if we are first in the chain
    if (!CCOp)
      return emitComparison(LHS, RHS, CC, DL, DAG);
    // Otherwise produce a ccmp.
    return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
                                     DAG);
  }
  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");

  bool IsOR = Opcode == ISD::OR;

  // Re-query the properties of both operands; the results decide the emission
  // order and where negations are applied.
  SDValue LHS = Val->getOperand(0);
  bool CanNegateL;
  bool MustBeFirstL;
  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
  assert(ValidL && "Valid conjunction/disjunction tree");
  (void)ValidL;

  SDValue RHS = Val->getOperand(1);
  bool CanNegateR;
  bool MustBeFirstR;
  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
  assert(ValidR && "Valid conjunction/disjunction tree");
  (void)ValidR;

  // Swap sub-tree that must come first to the right side (the right side is
  // the one emitted first below).
  if (MustBeFirstL) {
    assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
    std::swap(LHS, RHS);
    std::swap(CanNegateL, CanNegateR);
    std::swap(MustBeFirstL, MustBeFirstR);
  }

  // NegateR / NegateL: emit that sub-tree negated via its SETCC conditions.
  // NegateAfterR:      invert the condition code obtained from the right
  //                    sub-tree after it has been emitted.
  // NegateAfterAll:    invert the final condition code of the whole node.
  bool NegateR;
  bool NegateAfterR;
  bool NegateL;
  bool NegateAfterAll;
  if (Opcode == ISD::OR) {
    // Swap the sub-tree that we can negate naturally to the left.
    if (!CanNegateL) {
      assert(CanNegateR && "at least one side must be negatable");
      assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
      assert(!Negate);
      // Only the operands are swapped here; the CanNegate*/MustBeFirst* flags
      // are not read again in this branch.
      std::swap(LHS, RHS);
      NegateR = false;
      NegateAfterR = true;
    } else {
      // Negate the right sub-tree naturally if possible, otherwise negate its
      // result afterwards.
      NegateR = CanNegateR;
      NegateAfterR = !CanNegateR;
    }
    NegateL = true;
    NegateAfterAll = !Negate;
  } else {
    assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
    assert(!Negate && "Valid conjunction/disjunction tree");

    NegateL = false;
    NegateR = false;
    NegateAfterR = false;
    NegateAfterAll = false;
  }

  // Emit sub-trees. The right one goes first; its flags and (possibly
  // inverted) condition code are the chain input for the left one.
  AArch64CC::CondCode RHSCC;
  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
  if (NegateAfterR)
    RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
  if (NegateAfterAll)
    OutCC = AArch64CC::getInvertedCondCode(OutCC);
  return CmpL;
}
2609 
2610 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
2611 /// In some cases this is even possible with OR operations in the expression.
2612 /// See \ref AArch64CCMP.
2613 /// \see emitConjunctionRec().
2614 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
2615                                AArch64CC::CondCode &OutCC) {
2616   bool DummyCanNegate;
2617   bool DummyMustBeFirst;
2618   if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
2619     return SDValue();
2620 
2621   return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
2622 }
2623 
2624 /// @}
2625 
2626 /// Returns how profitable it is to fold a comparison's operand's shift and/or
2627 /// extension operations.
2628 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
2629   auto isSupportedExtend = [&](SDValue V) {
2630     if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
2631       return true;
2632 
2633     if (V.getOpcode() == ISD::AND)
2634       if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
2635         uint64_t Mask = MaskCst->getZExtValue();
2636         return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
2637       }
2638 
2639     return false;
2640   };
2641 
2642   if (!Op.hasOneUse())
2643     return 0;
2644 
2645   if (isSupportedExtend(Op))
2646     return 1;
2647 
2648   unsigned Opc = Op.getOpcode();
2649   if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
2650     if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
2651       uint64_t Shift = ShiftCst->getZExtValue();
2652       if (isSupportedExtend(Op.getOperand(0)))
2653         return (Shift <= 4) ? 2 : 1;
2654       EVT VT = Op.getValueType();
2655       if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
2656         return 1;
2657     }
2658 
2659   return 0;
2660 }
2661 
2662 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2663                              SDValue &AArch64cc, SelectionDAG &DAG,
2664                              const SDLoc &dl) {
2665   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
2666     EVT VT = RHS.getValueType();
2667     uint64_t C = RHSC->getZExtValue();
2668     if (!isLegalArithImmed(C)) {
2669       // Constant does not fit, try adjusting it by one?
2670       switch (CC) {
2671       default:
2672         break;
2673       case ISD::SETLT:
2674       case ISD::SETGE:
2675         if ((VT == MVT::i32 && C != 0x80000000 &&
2676              isLegalArithImmed((uint32_t)(C - 1))) ||
2677             (VT == MVT::i64 && C != 0x80000000ULL &&
2678              isLegalArithImmed(C - 1ULL))) {
2679           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
2680           C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2681           RHS = DAG.getConstant(C, dl, VT);
2682         }
2683         break;
2684       case ISD::SETULT:
2685       case ISD::SETUGE:
2686         if ((VT == MVT::i32 && C != 0 &&
2687              isLegalArithImmed((uint32_t)(C - 1))) ||
2688             (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
2689           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
2690           C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2691           RHS = DAG.getConstant(C, dl, VT);
2692         }
2693         break;
2694       case ISD::SETLE:
2695       case ISD::SETGT:
2696         if ((VT == MVT::i32 && C != INT32_MAX &&
2697              isLegalArithImmed((uint32_t)(C + 1))) ||
2698             (VT == MVT::i64 && C != INT64_MAX &&
2699              isLegalArithImmed(C + 1ULL))) {
2700           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
2701           C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2702           RHS = DAG.getConstant(C, dl, VT);
2703         }
2704         break;
2705       case ISD::SETULE:
2706       case ISD::SETUGT:
2707         if ((VT == MVT::i32 && C != UINT32_MAX &&
2708              isLegalArithImmed((uint32_t)(C + 1))) ||
2709             (VT == MVT::i64 && C != UINT64_MAX &&
2710              isLegalArithImmed(C + 1ULL))) {
2711           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
2712           C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2713           RHS = DAG.getConstant(C, dl, VT);
2714         }
2715         break;
2716       }
2717     }
2718   }
2719 
2720   // Comparisons are canonicalized so that the RHS operand is simpler than the
2721   // LHS one, the extreme case being when RHS is an immediate. However, AArch64
2722   // can fold some shift+extend operations on the RHS operand, so swap the
2723   // operands if that can be done.
2724   //
2725   // For example:
2726   //    lsl     w13, w11, #1
2727   //    cmp     w13, w12
2728   // can be turned into:
2729   //    cmp     w12, w11, lsl #1
2730   if (!isa<ConstantSDNode>(RHS) ||
2731       !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
2732     SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
2733 
2734     if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
2735       std::swap(LHS, RHS);
2736       CC = ISD::getSetCCSwappedOperands(CC);
2737     }
2738   }
2739 
2740   SDValue Cmp;
2741   AArch64CC::CondCode AArch64CC;
2742   if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
2743     const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
2744 
2745     // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
2746     // For the i8 operand, the largest immediate is 255, so this can be easily
2747     // encoded in the compare instruction. For the i16 operand, however, the
2748     // largest immediate cannot be encoded in the compare.
2749     // Therefore, use a sign extending load and cmn to avoid materializing the
2750     // -1 constant. For example,
2751     // movz w1, #65535
2752     // ldrh w0, [x0, #0]
2753     // cmp w0, w1
2754     // >
2755     // ldrsh w0, [x0, #0]
2756     // cmn w0, #1
2757     // Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
2758     // if and only if (sext LHS) == (sext RHS). The checks are in place to
2759     // ensure both the LHS and RHS are truly zero extended and to make sure the
2760     // transformation is profitable.
2761     if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
2762         cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
2763         cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
2764         LHS.getNode()->hasNUsesOfValue(1, 0)) {
2765       int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
2766       if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
2767         SDValue SExt =
2768             DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
2769                         DAG.getValueType(MVT::i16));
2770         Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
2771                                                    RHS.getValueType()),
2772                              CC, dl, DAG);
2773         AArch64CC = changeIntCCToAArch64CC(CC);
2774       }
2775     }
2776 
2777     if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
2778       if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
2779         if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
2780           AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
2781       }
2782     }
2783   }
2784 
2785   if (!Cmp) {
2786     Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
2787     AArch64CC = changeIntCCToAArch64CC(CC);
2788   }
2789   AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
2790   return Cmp;
2791 }
2792 
2793 static std::pair<SDValue, SDValue>
2794 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
2795   assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
2796          "Unsupported value type");
2797   SDValue Value, Overflow;
2798   SDLoc DL(Op);
2799   SDValue LHS = Op.getOperand(0);
2800   SDValue RHS = Op.getOperand(1);
2801   unsigned Opc = 0;
2802   switch (Op.getOpcode()) {
2803   default:
2804     llvm_unreachable("Unknown overflow instruction!");
2805   case ISD::SADDO:
2806     Opc = AArch64ISD::ADDS;
2807     CC = AArch64CC::VS;
2808     break;
2809   case ISD::UADDO:
2810     Opc = AArch64ISD::ADDS;
2811     CC = AArch64CC::HS;
2812     break;
2813   case ISD::SSUBO:
2814     Opc = AArch64ISD::SUBS;
2815     CC = AArch64CC::VS;
2816     break;
2817   case ISD::USUBO:
2818     Opc = AArch64ISD::SUBS;
2819     CC = AArch64CC::LO;
2820     break;
2821   // Multiply needs a little bit extra work.
2822   case ISD::SMULO:
2823   case ISD::UMULO: {
2824     CC = AArch64CC::NE;
2825     bool IsSigned = Op.getOpcode() == ISD::SMULO;
2826     if (Op.getValueType() == MVT::i32) {
2827       unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2828       // For a 32 bit multiply with overflow check we want the instruction
2829       // selector to generate a widening multiply (SMADDL/UMADDL). For that we
2830       // need to generate the following pattern:
2831       // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
2832       LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
2833       RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
2834       SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2835       SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
2836                                 DAG.getConstant(0, DL, MVT::i64));
2837       // On AArch64 the upper 32 bits are always zero extended for a 32 bit
2838       // operation. We need to clear out the upper 32 bits, because we used a
2839       // widening multiply that wrote all 64 bits. In the end this should be a
2840       // noop.
2841       Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
2842       if (IsSigned) {
2843         // The signed overflow check requires more than just a simple check for
2844         // any bit set in the upper 32 bits of the result. These bits could be
2845         // just the sign bits of a negative number. To perform the overflow
2846         // check we have to arithmetic shift right the 32nd bit of the result by
2847         // 31 bits. Then we compare the result to the upper 32 bits.
2848         SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
2849                                         DAG.getConstant(32, DL, MVT::i64));
2850         UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
2851         SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
2852                                         DAG.getConstant(31, DL, MVT::i64));
2853         // It is important that LowerBits is last, otherwise the arithmetic
2854         // shift will not be folded into the compare (SUBS).
2855         SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
2856         Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2857                        .getValue(1);
2858       } else {
2859         // The overflow check for unsigned multiply is easy. We only need to
2860         // check if any of the upper 32 bits are set. This can be done with a
2861         // CMP (shifted register). For that we need to generate the following
2862         // pattern:
2863         // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
2864         SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
2865                                         DAG.getConstant(32, DL, MVT::i64));
2866         SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2867         Overflow =
2868             DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2869                         DAG.getConstant(0, DL, MVT::i64),
2870                         UpperBits).getValue(1);
2871       }
2872       break;
2873     }
2874     assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
2875     // For the 64 bit multiply
2876     Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2877     if (IsSigned) {
2878       SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
2879       SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
2880                                       DAG.getConstant(63, DL, MVT::i64));
2881       // It is important that LowerBits is last, otherwise the arithmetic
2882       // shift will not be folded into the compare (SUBS).
2883       SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2884       Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2885                      .getValue(1);
2886     } else {
2887       SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
2888       SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2889       Overflow =
2890           DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2891                       DAG.getConstant(0, DL, MVT::i64),
2892                       UpperBits).getValue(1);
2893     }
2894     break;
2895   }
2896   } // switch (...)
2897 
2898   if (Opc) {
2899     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
2900 
2901     // Emit the AArch64 operation with overflow check.
2902     Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
2903     Overflow = Value.getValue(1);
2904   }
2905   return std::make_pair(Value, Overflow);
2906 }
2907 
2908 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
2909   if (useSVEForFixedLengthVectorVT(Op.getValueType()))
2910     return LowerToScalableOp(Op, DAG);
2911 
2912   SDValue Sel = Op.getOperand(0);
2913   SDValue Other = Op.getOperand(1);
2914   SDLoc dl(Sel);
2915 
2916   // If the operand is an overflow checking operation, invert the condition
2917   // code and kill the Not operation. I.e., transform:
2918   // (xor (overflow_op_bool, 1))
2919   //   -->
2920   // (csel 1, 0, invert(cc), overflow_op_bool)
2921   // ... which later gets transformed to just a cset instruction with an
2922   // inverted condition code, rather than a cset + eor sequence.
2923   if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
2924     // Only lower legal XALUO ops.
2925     if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
2926       return SDValue();
2927 
2928     SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2929     SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2930     AArch64CC::CondCode CC;
2931     SDValue Value, Overflow;
2932     std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
2933     SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2934     return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
2935                        CCVal, Overflow);
2936   }
2937   // If neither operand is a SELECT_CC, give up.
2938   if (Sel.getOpcode() != ISD::SELECT_CC)
2939     std::swap(Sel, Other);
2940   if (Sel.getOpcode() != ISD::SELECT_CC)
2941     return Op;
2942 
2943   // The folding we want to perform is:
2944   // (xor x, (select_cc a, b, cc, 0, -1) )
2945   //   -->
2946   // (csel x, (xor x, -1), cc ...)
2947   //
2948   // The latter will get matched to a CSINV instruction.
2949 
2950   ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
2951   SDValue LHS = Sel.getOperand(0);
2952   SDValue RHS = Sel.getOperand(1);
2953   SDValue TVal = Sel.getOperand(2);
2954   SDValue FVal = Sel.getOperand(3);
2955 
2956   // FIXME: This could be generalized to non-integer comparisons.
2957   if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
2958     return Op;
2959 
2960   ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
2961   ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
2962 
2963   // The values aren't constants, this isn't the pattern we're looking for.
2964   if (!CFVal || !CTVal)
2965     return Op;
2966 
2967   // We can commute the SELECT_CC by inverting the condition.  This
2968   // might be needed to make this fit into a CSINV pattern.
2969   if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
2970     std::swap(TVal, FVal);
2971     std::swap(CTVal, CFVal);
2972     CC = ISD::getSetCCInverse(CC, LHS.getValueType());
2973   }
2974 
2975   // If the constants line up, perform the transform!
2976   if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
2977     SDValue CCVal;
2978     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
2979 
2980     FVal = Other;
2981     TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
2982                        DAG.getConstant(-1ULL, dl, Other.getValueType()));
2983 
2984     return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
2985                        CCVal, Cmp);
2986   }
2987 
2988   return Op;
2989 }
2990 
2991 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
2992   EVT VT = Op.getValueType();
2993 
2994   // Let legalize expand this if it isn't a legal type yet.
2995   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
2996     return SDValue();
2997 
2998   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
2999 
3000   unsigned Opc;
3001   bool ExtraOp = false;
3002   switch (Op.getOpcode()) {
3003   default:
3004     llvm_unreachable("Invalid code");
3005   case ISD::ADDC:
3006     Opc = AArch64ISD::ADDS;
3007     break;
3008   case ISD::SUBC:
3009     Opc = AArch64ISD::SUBS;
3010     break;
3011   case ISD::ADDE:
3012     Opc = AArch64ISD::ADCS;
3013     ExtraOp = true;
3014     break;
3015   case ISD::SUBE:
3016     Opc = AArch64ISD::SBCS;
3017     ExtraOp = true;
3018     break;
3019   }
3020 
3021   if (!ExtraOp)
3022     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
3023   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
3024                      Op.getOperand(2));
3025 }
3026 
3027 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3028   // Let legalize expand this if it isn't a legal type yet.
3029   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3030     return SDValue();
3031 
3032   SDLoc dl(Op);
3033   AArch64CC::CondCode CC;
3034   // The actual operation that sets the overflow or carry flag.
3035   SDValue Value, Overflow;
3036   std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3037 
3038   // We use 0 and 1 as false and true values.
3039   SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3040   SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3041 
3042   // We use an inverted condition, because the conditional select is inverted
3043   // too. This will allow it to be selected to a single instruction:
3044   // CSINC Wd, WZR, WZR, invert(cond).
3045   SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3046   Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3047                          CCVal, Overflow);
3048 
3049   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3050   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3051 }
3052 
3053 // Prefetch operands are:
3054 // 1: Address to prefetch
3055 // 2: bool isWrite
3056 // 3: int locality (0 = no locality ... 3 = extreme locality)
3057 // 4: bool isDataCache
3058 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
3059   SDLoc DL(Op);
3060   unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
3061   unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
3062   unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3063 
3064   bool IsStream = !Locality;
3065   // When the locality number is set
3066   if (Locality) {
3067     // The front-end should have filtered out the out-of-range values
3068     assert(Locality <= 3 && "Prefetch locality out-of-range");
3069     // The locality degree is the opposite of the cache speed.
3070     // Put the number the other way around.
3071     // The encoding starts at 0 for level 1
3072     Locality = 3 - Locality;
3073   }
3074 
3075   // built the mask value encoding the expected behavior.
3076   unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
3077                    (!IsData << 3) |     // IsDataCache bit
3078                    (Locality << 1) |    // Cache level bits
3079                    (unsigned)IsStream;  // Stream bit
3080   return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
3081                      DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
3082 }
3083 
3084 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
3085                                               SelectionDAG &DAG) const {
3086   if (Op.getValueType().isScalableVector())
3087     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
3088 
3089   assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
3090   return SDValue();
3091 }
3092 
3093 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
3094                                              SelectionDAG &DAG) const {
3095   if (Op.getValueType().isScalableVector())
3096     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
3097 
3098   bool IsStrict = Op->isStrictFPOpcode();
3099   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3100   EVT SrcVT = SrcVal.getValueType();
3101 
3102   if (SrcVT != MVT::f128) {
3103     // Expand cases where the input is a vector bigger than NEON.
3104     if (useSVEForFixedLengthVectorVT(SrcVT))
3105       return SDValue();
3106 
3107     // It's legal except when f128 is involved
3108     return Op;
3109   }
3110 
3111   return SDValue();
3112 }
3113 
3114 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
3115                                                     SelectionDAG &DAG) const {
3116   // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3117   // Any additional optimization in this function should be recorded
3118   // in the cost tables.
3119   EVT InVT = Op.getOperand(0).getValueType();
3120   EVT VT = Op.getValueType();
3121 
3122   if (VT.isScalableVector()) {
3123     unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
3124                           ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
3125                           : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
3126     return LowerToPredicatedOp(Op, DAG, Opcode);
3127   }
3128 
3129   unsigned NumElts = InVT.getVectorNumElements();
3130 
3131   // f16 conversions are promoted to f32 when full fp16 is not supported.
3132   if (InVT.getVectorElementType() == MVT::f16 &&
3133       !Subtarget->hasFullFP16()) {
3134     MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
3135     SDLoc dl(Op);
3136     return DAG.getNode(
3137         Op.getOpcode(), dl, Op.getValueType(),
3138         DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
3139   }
3140 
3141   uint64_t VTSize = VT.getFixedSizeInBits();
3142   uint64_t InVTSize = InVT.getFixedSizeInBits();
3143   if (VTSize < InVTSize) {
3144     SDLoc dl(Op);
3145     SDValue Cv =
3146         DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
3147                     Op.getOperand(0));
3148     return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
3149   }
3150 
3151   if (VTSize > InVTSize) {
3152     SDLoc dl(Op);
3153     MVT ExtVT =
3154         MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
3155                          VT.getVectorNumElements());
3156     SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
3157     return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
3158   }
3159 
3160   // Type changing conversions are illegal.
3161   return Op;
3162 }
3163 
3164 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
3165                                               SelectionDAG &DAG) const {
3166   bool IsStrict = Op->isStrictFPOpcode();
3167   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3168 
3169   if (SrcVal.getValueType().isVector())
3170     return LowerVectorFP_TO_INT(Op, DAG);
3171 
3172   // f16 conversions are promoted to f32 when full fp16 is not supported.
3173   if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
3174     assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
3175     SDLoc dl(Op);
3176     return DAG.getNode(
3177         Op.getOpcode(), dl, Op.getValueType(),
3178         DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
3179   }
3180 
3181   if (SrcVal.getValueType() != MVT::f128) {
3182     // It's legal except when f128 is involved
3183     return Op;
3184   }
3185 
3186   return SDValue();
3187 }
3188 
3189 SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
3190                                                     SelectionDAG &DAG) const {
3191   // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3192   // Any additional optimization in this function should be recorded
3193   // in the cost tables.
3194   EVT VT = Op.getValueType();
3195   SDLoc dl(Op);
3196   SDValue In = Op.getOperand(0);
3197   EVT InVT = In.getValueType();
3198   unsigned Opc = Op.getOpcode();
3199   bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
3200 
3201   if (VT.isScalableVector()) {
3202     if (InVT.getVectorElementType() == MVT::i1) {
3203       // We can't directly extend an SVE predicate; extend it first.
3204       unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3205       EVT CastVT = getPromotedVTForPredicate(InVT);
3206       In = DAG.getNode(CastOpc, dl, CastVT, In);
3207       return DAG.getNode(Opc, dl, VT, In);
3208     }
3209 
3210     unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
3211                                : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
3212     return LowerToPredicatedOp(Op, DAG, Opcode);
3213   }
3214 
3215   uint64_t VTSize = VT.getFixedSizeInBits();
3216   uint64_t InVTSize = InVT.getFixedSizeInBits();
3217   if (VTSize < InVTSize) {
3218     MVT CastVT =
3219         MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
3220                          InVT.getVectorNumElements());
3221     In = DAG.getNode(Opc, dl, CastVT, In);
3222     return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
3223   }
3224 
3225   if (VTSize > InVTSize) {
3226     unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3227     EVT CastVT = VT.changeVectorElementTypeToInteger();
3228     In = DAG.getNode(CastOpc, dl, CastVT, In);
3229     return DAG.getNode(Opc, dl, VT, In);
3230   }
3231 
3232   return Op;
3233 }
3234 
3235 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
3236                                             SelectionDAG &DAG) const {
3237   if (Op.getValueType().isVector())
3238     return LowerVectorINT_TO_FP(Op, DAG);
3239 
3240   bool IsStrict = Op->isStrictFPOpcode();
3241   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3242 
3243   // f16 conversions are promoted to f32 when full fp16 is not supported.
3244   if (Op.getValueType() == MVT::f16 &&
3245       !Subtarget->hasFullFP16()) {
3246     assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
3247     SDLoc dl(Op);
3248     return DAG.getNode(
3249         ISD::FP_ROUND, dl, MVT::f16,
3250         DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
3251         DAG.getIntPtrConstant(0, dl));
3252   }
3253 
3254   // i128 conversions are libcalls.
3255   if (SrcVal.getValueType() == MVT::i128)
3256     return SDValue();
3257 
3258   // Other conversions are legal, unless it's to the completely software-based
3259   // fp128.
3260   if (Op.getValueType() != MVT::f128)
3261     return Op;
3262   return SDValue();
3263 }
3264 
3265 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
3266                                             SelectionDAG &DAG) const {
3267   // For iOS, we want to call an alternative entry point: __sincos_stret,
3268   // which returns the values in two S / D registers.
3269   SDLoc dl(Op);
3270   SDValue Arg = Op.getOperand(0);
3271   EVT ArgVT = Arg.getValueType();
3272   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
3273 
3274   ArgListTy Args;
3275   ArgListEntry Entry;
3276 
3277   Entry.Node = Arg;
3278   Entry.Ty = ArgTy;
3279   Entry.IsSExt = false;
3280   Entry.IsZExt = false;
3281   Args.push_back(Entry);
3282 
3283   RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
3284                                         : RTLIB::SINCOS_STRET_F32;
3285   const char *LibcallName = getLibcallName(LC);
3286   SDValue Callee =
3287       DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
3288 
3289   StructType *RetTy = StructType::get(ArgTy, ArgTy);
3290   TargetLowering::CallLoweringInfo CLI(DAG);
3291   CLI.setDebugLoc(dl)
3292       .setChain(DAG.getEntryNode())
3293       .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
3294 
3295   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3296   return CallResult.first;
3297 }
3298 
3299 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
3300   EVT OpVT = Op.getValueType();
3301   if (OpVT != MVT::f16 && OpVT != MVT::bf16)
3302     return SDValue();
3303 
3304   assert(Op.getOperand(0).getValueType() == MVT::i16);
3305   SDLoc DL(Op);
3306 
3307   Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
3308   Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
3309   return SDValue(
3310       DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
3311                          DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
3312       0);
3313 }
3314 
3315 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
3316   if (OrigVT.getSizeInBits() >= 64)
3317     return OrigVT;
3318 
3319   assert(OrigVT.isSimple() && "Expecting a simple value type");
3320 
3321   MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
3322   switch (OrigSimpleTy) {
3323   default: llvm_unreachable("Unexpected Vector Type");
3324   case MVT::v2i8:
3325   case MVT::v2i16:
3326      return MVT::v2i32;
3327   case MVT::v4i8:
3328     return  MVT::v4i16;
3329   }
3330 }
3331 
3332 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
3333                                                  const EVT &OrigTy,
3334                                                  const EVT &ExtTy,
3335                                                  unsigned ExtOpcode) {
3336   // The vector originally had a size of OrigTy. It was then extended to ExtTy.
3337   // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
3338   // 64-bits we need to insert a new extension so that it will be 64-bits.
3339   assert(ExtTy.is128BitVector() && "Unexpected extension size");
3340   if (OrigTy.getSizeInBits() >= 64)
3341     return N;
3342 
3343   // Must extend size to at least 64 bits to be used as an operand for VMULL.
3344   EVT NewVT = getExtensionTo64Bits(OrigTy);
3345 
3346   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
3347 }
3348 
3349 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
3350                                    bool isSigned) {
3351   EVT VT = N->getValueType(0);
3352 
3353   if (N->getOpcode() != ISD::BUILD_VECTOR)
3354     return false;
3355 
3356   for (const SDValue &Elt : N->op_values()) {
3357     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
3358       unsigned EltSize = VT.getScalarSizeInBits();
3359       unsigned HalfSize = EltSize / 2;
3360       if (isSigned) {
3361         if (!isIntN(HalfSize, C->getSExtValue()))
3362           return false;
3363       } else {
3364         if (!isUIntN(HalfSize, C->getZExtValue()))
3365           return false;
3366       }
3367       continue;
3368     }
3369     return false;
3370   }
3371 
3372   return true;
3373 }
3374 
3375 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
3376   if (N->getOpcode() == ISD::SIGN_EXTEND ||
3377       N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
3378     return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
3379                                              N->getOperand(0)->getValueType(0),
3380                                              N->getValueType(0),
3381                                              N->getOpcode());
3382 
3383   assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
3384   EVT VT = N->getValueType(0);
3385   SDLoc dl(N);
3386   unsigned EltSize = VT.getScalarSizeInBits() / 2;
3387   unsigned NumElts = VT.getVectorNumElements();
3388   MVT TruncVT = MVT::getIntegerVT(EltSize);
3389   SmallVector<SDValue, 8> Ops;
3390   for (unsigned i = 0; i != NumElts; ++i) {
3391     ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
3392     const APInt &CInt = C->getAPIntValue();
3393     // Element types smaller than 32 bits are not legal, so use i32 elements.
3394     // The values are implicitly truncated so sext vs. zext doesn't matter.
3395     Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
3396   }
3397   return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
3398 }
3399 
3400 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
3401   return N->getOpcode() == ISD::SIGN_EXTEND ||
3402          N->getOpcode() == ISD::ANY_EXTEND ||
3403          isExtendedBUILD_VECTOR(N, DAG, true);
3404 }
3405 
3406 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
3407   return N->getOpcode() == ISD::ZERO_EXTEND ||
3408          N->getOpcode() == ISD::ANY_EXTEND ||
3409          isExtendedBUILD_VECTOR(N, DAG, false);
3410 }
3411 
3412 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
3413   unsigned Opcode = N->getOpcode();
3414   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
3415     SDNode *N0 = N->getOperand(0).getNode();
3416     SDNode *N1 = N->getOperand(1).getNode();
3417     return N0->hasOneUse() && N1->hasOneUse() &&
3418       isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
3419   }
3420   return false;
3421 }
3422 
3423 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
3424   unsigned Opcode = N->getOpcode();
3425   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
3426     SDNode *N0 = N->getOperand(0).getNode();
3427     SDNode *N1 = N->getOperand(1).getNode();
3428     return N0->hasOneUse() && N1->hasOneUse() &&
3429       isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
3430   }
3431   return false;
3432 }
3433 
3434 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
3435                                                 SelectionDAG &DAG) const {
3436   // The rounding mode is in bits 23:22 of the FPSCR.
3437   // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
3438   // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
3439   // so that the shift + and get folded into a bitfield extract.
3440   SDLoc dl(Op);
3441 
3442   SDValue Chain = Op.getOperand(0);
3443   SDValue FPCR_64 = DAG.getNode(
3444       ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
3445       {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
3446   Chain = FPCR_64.getValue(1);
3447   SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
3448   SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
3449                                   DAG.getConstant(1U << 22, dl, MVT::i32));
3450   SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
3451                               DAG.getConstant(22, dl, MVT::i32));
3452   SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
3453                             DAG.getConstant(3, dl, MVT::i32));
3454   return DAG.getMergeValues({AND, Chain}, dl);
3455 }
3456 
3457 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
3458   EVT VT = Op.getValueType();
3459 
3460   // If SVE is available then i64 vector multiplications can also be made legal.
3461   bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
3462 
3463   if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
3464     return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED, OverrideNEON);
3465 
3466   // Multiplications are only custom-lowered for 128-bit vectors so that
3467   // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
3468   assert(VT.is128BitVector() && VT.isInteger() &&
3469          "unexpected type for custom-lowering ISD::MUL");
3470   SDNode *N0 = Op.getOperand(0).getNode();
3471   SDNode *N1 = Op.getOperand(1).getNode();
3472   unsigned NewOpc = 0;
3473   bool isMLA = false;
3474   bool isN0SExt = isSignExtended(N0, DAG);
3475   bool isN1SExt = isSignExtended(N1, DAG);
3476   if (isN0SExt && isN1SExt)
3477     NewOpc = AArch64ISD::SMULL;
3478   else {
3479     bool isN0ZExt = isZeroExtended(N0, DAG);
3480     bool isN1ZExt = isZeroExtended(N1, DAG);
3481     if (isN0ZExt && isN1ZExt)
3482       NewOpc = AArch64ISD::UMULL;
3483     else if (isN1SExt || isN1ZExt) {
3484       // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
3485       // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
3486       if (isN1SExt && isAddSubSExt(N0, DAG)) {
3487         NewOpc = AArch64ISD::SMULL;
3488         isMLA = true;
3489       } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
3490         NewOpc =  AArch64ISD::UMULL;
3491         isMLA = true;
3492       } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
3493         std::swap(N0, N1);
3494         NewOpc =  AArch64ISD::UMULL;
3495         isMLA = true;
3496       }
3497     }
3498 
3499     if (!NewOpc) {
3500       if (VT == MVT::v2i64)
3501         // Fall through to expand this.  It is not legal.
3502         return SDValue();
3503       else
3504         // Other vector multiplications are legal.
3505         return Op;
3506     }
3507   }
3508 
3509   // Legalize to a S/UMULL instruction
3510   SDLoc DL(Op);
3511   SDValue Op0;
3512   SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
3513   if (!isMLA) {
3514     Op0 = skipExtensionForVectorMULL(N0, DAG);
3515     assert(Op0.getValueType().is64BitVector() &&
3516            Op1.getValueType().is64BitVector() &&
3517            "unexpected types for extended operands to VMULL");
3518     return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
3519   }
3520   // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
3521   // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
3522   // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
3523   SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
3524   SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
3525   EVT Op1VT = Op1.getValueType();
3526   return DAG.getNode(N0->getOpcode(), DL, VT,
3527                      DAG.getNode(NewOpc, DL, VT,
3528                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
3529                      DAG.getNode(NewOpc, DL, VT,
3530                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
3531 }
3532 
3533 static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
3534                                int Pattern) {
3535   return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
3536                      DAG.getTargetConstant(Pattern, DL, MVT::i32));
3537 }
3538 
3539 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
3540                                                      SelectionDAG &DAG) const {
3541   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3542   SDLoc dl(Op);
3543   switch (IntNo) {
3544   default: return SDValue();    // Don't custom lower most intrinsics.
3545   case Intrinsic::thread_pointer: {
3546     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3547     return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
3548   }
3549   case Intrinsic::aarch64_neon_abs: {
3550     EVT Ty = Op.getValueType();
3551     if (Ty == MVT::i64) {
3552       SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
3553                                    Op.getOperand(1));
3554       Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
3555       return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
3556     } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
3557       return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
3558     } else {
3559       report_fatal_error("Unexpected type for AArch64 NEON intrinic");
3560     }
3561   }
3562   case Intrinsic::aarch64_neon_smax:
3563     return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
3564                        Op.getOperand(1), Op.getOperand(2));
3565   case Intrinsic::aarch64_neon_umax:
3566     return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
3567                        Op.getOperand(1), Op.getOperand(2));
3568   case Intrinsic::aarch64_neon_smin:
3569     return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
3570                        Op.getOperand(1), Op.getOperand(2));
3571   case Intrinsic::aarch64_neon_umin:
3572     return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
3573                        Op.getOperand(1), Op.getOperand(2));
3574 
3575   case Intrinsic::aarch64_sve_sunpkhi:
3576     return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
3577                        Op.getOperand(1));
3578   case Intrinsic::aarch64_sve_sunpklo:
3579     return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
3580                        Op.getOperand(1));
3581   case Intrinsic::aarch64_sve_uunpkhi:
3582     return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
3583                        Op.getOperand(1));
3584   case Intrinsic::aarch64_sve_uunpklo:
3585     return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
3586                        Op.getOperand(1));
3587   case Intrinsic::aarch64_sve_clasta_n:
3588     return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
3589                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3590   case Intrinsic::aarch64_sve_clastb_n:
3591     return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
3592                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3593   case Intrinsic::aarch64_sve_lasta:
3594     return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
3595                        Op.getOperand(1), Op.getOperand(2));
3596   case Intrinsic::aarch64_sve_lastb:
3597     return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
3598                        Op.getOperand(1), Op.getOperand(2));
3599   case Intrinsic::aarch64_sve_rev:
3600     return DAG.getNode(AArch64ISD::REV, dl, Op.getValueType(),
3601                        Op.getOperand(1));
3602   case Intrinsic::aarch64_sve_tbl:
3603     return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
3604                        Op.getOperand(1), Op.getOperand(2));
3605   case Intrinsic::aarch64_sve_trn1:
3606     return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
3607                        Op.getOperand(1), Op.getOperand(2));
3608   case Intrinsic::aarch64_sve_trn2:
3609     return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
3610                        Op.getOperand(1), Op.getOperand(2));
3611   case Intrinsic::aarch64_sve_uzp1:
3612     return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
3613                        Op.getOperand(1), Op.getOperand(2));
3614   case Intrinsic::aarch64_sve_uzp2:
3615     return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
3616                        Op.getOperand(1), Op.getOperand(2));
3617   case Intrinsic::aarch64_sve_zip1:
3618     return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
3619                        Op.getOperand(1), Op.getOperand(2));
3620   case Intrinsic::aarch64_sve_zip2:
3621     return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
3622                        Op.getOperand(1), Op.getOperand(2));
3623   case Intrinsic::aarch64_sve_ptrue:
3624     return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
3625                        Op.getOperand(1));
3626   case Intrinsic::aarch64_sve_clz:
3627     return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
3628                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3629   case Intrinsic::aarch64_sve_cnt: {
3630     SDValue Data = Op.getOperand(3);
3631     // CTPOP only supports integer operands.
3632     if (Data.getValueType().isFloatingPoint())
3633       Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
3634     return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
3635                        Op.getOperand(2), Data, Op.getOperand(1));
3636   }
3637   case Intrinsic::aarch64_sve_dupq_lane:
3638     return LowerDUPQLane(Op, DAG);
3639   case Intrinsic::aarch64_sve_convert_from_svbool:
3640     return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(),
3641                        Op.getOperand(1));
3642   case Intrinsic::aarch64_sve_fneg:
3643     return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
3644                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3645   case Intrinsic::aarch64_sve_frintp:
3646     return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
3647                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3648   case Intrinsic::aarch64_sve_frintm:
3649     return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
3650                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3651   case Intrinsic::aarch64_sve_frinti:
3652     return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
3653                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3654   case Intrinsic::aarch64_sve_frintx:
3655     return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
3656                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3657   case Intrinsic::aarch64_sve_frinta:
3658     return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
3659                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3660   case Intrinsic::aarch64_sve_frintn:
3661     return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
3662                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3663   case Intrinsic::aarch64_sve_frintz:
3664     return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
3665                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3666   case Intrinsic::aarch64_sve_ucvtf:
3667     return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
3668                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
3669                        Op.getOperand(1));
3670   case Intrinsic::aarch64_sve_scvtf:
3671     return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
3672                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
3673                        Op.getOperand(1));
3674   case Intrinsic::aarch64_sve_fcvtzu:
3675     return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
3676                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
3677                        Op.getOperand(1));
3678   case Intrinsic::aarch64_sve_fcvtzs:
3679     return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
3680                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
3681                        Op.getOperand(1));
3682   case Intrinsic::aarch64_sve_fsqrt:
3683     return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
3684                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3685   case Intrinsic::aarch64_sve_frecpx:
3686     return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
3687                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3688   case Intrinsic::aarch64_sve_fabs:
3689     return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
3690                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3691   case Intrinsic::aarch64_sve_abs:
3692     return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
3693                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3694   case Intrinsic::aarch64_sve_neg:
3695     return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
3696                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3697   case Intrinsic::aarch64_sve_convert_to_svbool: {
3698     EVT OutVT = Op.getValueType();
3699     EVT InVT = Op.getOperand(1).getValueType();
3700     // Return the operand if the cast isn't changing type,
3701     // i.e. <n x 16 x i1> -> <n x 16 x i1>
3702     if (InVT == OutVT)
3703       return Op.getOperand(1);
3704     // Otherwise, zero the newly introduced lanes.
3705     SDValue Reinterpret =
3706         DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Op.getOperand(1));
3707     SDValue Mask = getPTrue(DAG, dl, InVT, AArch64SVEPredPattern::all);
3708     SDValue MaskReinterpret =
3709         DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Mask);
3710     return DAG.getNode(ISD::AND, dl, OutVT, Reinterpret, MaskReinterpret);
3711   }
3712 
3713   case Intrinsic::aarch64_sve_insr: {
3714     SDValue Scalar = Op.getOperand(2);
3715     EVT ScalarTy = Scalar.getValueType();
3716     if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
3717       Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
3718 
3719     return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
3720                        Op.getOperand(1), Scalar);
3721   }
3722   case Intrinsic::aarch64_sve_rbit:
3723     return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
3724                        Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
3725                        Op.getOperand(1));
3726   case Intrinsic::aarch64_sve_revb:
3727     return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
3728                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
3729   case Intrinsic::aarch64_sve_sxtb:
3730     return DAG.getNode(
3731         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
3732         Op.getOperand(2), Op.getOperand(3),
3733         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
3734         Op.getOperand(1));
3735   case Intrinsic::aarch64_sve_sxth:
3736     return DAG.getNode(
3737         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
3738         Op.getOperand(2), Op.getOperand(3),
3739         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
3740         Op.getOperand(1));
3741   case Intrinsic::aarch64_sve_sxtw:
3742     return DAG.getNode(
3743         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
3744         Op.getOperand(2), Op.getOperand(3),
3745         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
3746         Op.getOperand(1));
3747   case Intrinsic::aarch64_sve_uxtb:
3748     return DAG.getNode(
3749         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
3750         Op.getOperand(2), Op.getOperand(3),
3751         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
3752         Op.getOperand(1));
3753   case Intrinsic::aarch64_sve_uxth:
3754     return DAG.getNode(
3755         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
3756         Op.getOperand(2), Op.getOperand(3),
3757         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
3758         Op.getOperand(1));
3759   case Intrinsic::aarch64_sve_uxtw:
3760     return DAG.getNode(
3761         AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
3762         Op.getOperand(2), Op.getOperand(3),
3763         DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
3764         Op.getOperand(1));
3765 
3766   case Intrinsic::localaddress: {
3767     const auto &MF = DAG.getMachineFunction();
3768     const auto *RegInfo = Subtarget->getRegisterInfo();
3769     unsigned Reg = RegInfo->getLocalAddressRegister(MF);
3770     return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
3771                               Op.getSimpleValueType());
3772   }
3773 
3774   case Intrinsic::eh_recoverfp: {
3775     // FIXME: This needs to be implemented to correctly handle highly aligned
3776     // stack objects. For now we simply return the incoming FP. Refer D53541
3777     // for more details.
3778     SDValue FnOp = Op.getOperand(1);
3779     SDValue IncomingFPOp = Op.getOperand(2);
3780     GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
3781     auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
3782     if (!Fn)
3783       report_fatal_error(
3784           "llvm.eh.recoverfp must take a function as the first argument");
3785     return IncomingFPOp;
3786   }
3787 
3788   case Intrinsic::aarch64_neon_vsri:
3789   case Intrinsic::aarch64_neon_vsli: {
3790     EVT Ty = Op.getValueType();
3791 
3792     if (!Ty.isVector())
3793       report_fatal_error("Unexpected type for aarch64_neon_vsli");
3794 
3795     assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
3796 
3797     bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
3798     unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
3799     return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
3800                        Op.getOperand(3));
3801   }
3802 
3803   case Intrinsic::aarch64_neon_srhadd:
3804   case Intrinsic::aarch64_neon_urhadd:
3805   case Intrinsic::aarch64_neon_shadd:
3806   case Intrinsic::aarch64_neon_uhadd: {
3807     bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
3808                         IntNo == Intrinsic::aarch64_neon_shadd);
3809     bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
3810                           IntNo == Intrinsic::aarch64_neon_urhadd);
3811     unsigned Opcode =
3812         IsSignedAdd ? (IsRoundingAdd ? AArch64ISD::SRHADD : AArch64ISD::SHADD)
3813                     : (IsRoundingAdd ? AArch64ISD::URHADD : AArch64ISD::UHADD);
3814     return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
3815                        Op.getOperand(2));
3816   }
3817 
3818   case Intrinsic::aarch64_neon_uabd: {
3819     return DAG.getNode(AArch64ISD::UABD, dl, Op.getValueType(),
3820                        Op.getOperand(1), Op.getOperand(2));
3821   }
3822   case Intrinsic::aarch64_neon_sabd: {
3823     return DAG.getNode(AArch64ISD::SABD, dl, Op.getValueType(),
3824                        Op.getOperand(1), Op.getOperand(2));
3825   }
3826   }
3827 }
3828 
3829 bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
3830   if (VT.getVectorElementType() == MVT::i32 &&
3831       VT.getVectorElementCount().getKnownMinValue() >= 4)
3832     return true;
3833 
3834   return false;
3835 }
3836 
3837 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
3838   return ExtVal.getValueType().isScalableVector();
3839 }
3840 
3841 unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
3842   std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
3843       {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
3844        AArch64ISD::GLD1_MERGE_ZERO},
3845       {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
3846        AArch64ISD::GLD1_UXTW_MERGE_ZERO},
3847       {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
3848        AArch64ISD::GLD1_MERGE_ZERO},
3849       {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
3850        AArch64ISD::GLD1_SXTW_MERGE_ZERO},
3851       {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
3852        AArch64ISD::GLD1_SCALED_MERGE_ZERO},
3853       {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
3854        AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
3855       {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
3856        AArch64ISD::GLD1_SCALED_MERGE_ZERO},
3857       {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
3858        AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
3859   };
3860   auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
3861   return AddrModes.find(Key)->second;
3862 }
3863 
3864 unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
3865   std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
3866       {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
3867        AArch64ISD::SST1_PRED},
3868       {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
3869        AArch64ISD::SST1_UXTW_PRED},
3870       {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
3871        AArch64ISD::SST1_PRED},
3872       {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
3873        AArch64ISD::SST1_SXTW_PRED},
3874       {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
3875        AArch64ISD::SST1_SCALED_PRED},
3876       {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
3877        AArch64ISD::SST1_UXTW_SCALED_PRED},
3878       {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
3879        AArch64ISD::SST1_SCALED_PRED},
3880       {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
3881        AArch64ISD::SST1_SXTW_SCALED_PRED},
3882   };
3883   auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
3884   return AddrModes.find(Key)->second;
3885 }
3886 
3887 unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
3888   switch (Opcode) {
3889   default:
3890     llvm_unreachable("unimplemented opcode");
3891     return Opcode;
3892   case AArch64ISD::GLD1_MERGE_ZERO:
3893     return AArch64ISD::GLD1S_MERGE_ZERO;
3894   case AArch64ISD::GLD1_IMM_MERGE_ZERO:
3895     return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
3896   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
3897     return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
3898   case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
3899     return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
3900   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
3901     return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
3902   case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
3903     return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
3904   case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
3905     return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
3906   }
3907 }
3908 
3909 bool getGatherScatterIndexIsExtended(SDValue Index) {
3910   unsigned Opcode = Index.getOpcode();
3911   if (Opcode == ISD::SIGN_EXTEND_INREG)
3912     return true;
3913 
3914   if (Opcode == ISD::AND) {
3915     SDValue Splat = Index.getOperand(1);
3916     if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
3917       return false;
3918     ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0));
3919     if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF)
3920       return false;
3921     return true;
3922   }
3923 
3924   return false;
3925 }
3926 
3927 // If the base pointer of a masked gather or scatter is null, we
3928 // may be able to swap BasePtr & Index and use the vector + register
3929 // or vector + immediate addressing mode, e.g.
3930 // VECTOR + REGISTER:
3931 //    getelementptr nullptr, <vscale x N x T> (splat(%offset)) + %indices)
3932 // -> getelementptr %offset, <vscale x N x T> %indices
3933 // VECTOR + IMMEDIATE:
3934 //    getelementptr nullptr, <vscale x N x T> (splat(#x)) + %indices)
3935 // -> getelementptr #x, <vscale x N x T> %indices
3936 void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index, EVT MemVT,
3937                                  unsigned &Opcode, bool IsGather,
3938                                  SelectionDAG &DAG) {
3939   if (!isNullConstant(BasePtr))
3940     return;
3941 
3942   ConstantSDNode *Offset = nullptr;
3943   if (Index.getOpcode() == ISD::ADD)
3944     if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) {
3945       if (isa<ConstantSDNode>(SplatVal))
3946         Offset = cast<ConstantSDNode>(SplatVal);
3947       else {
3948         BasePtr = SplatVal;
3949         Index = Index->getOperand(0);
3950         return;
3951       }
3952     }
3953 
3954   unsigned NewOp =
3955       IsGather ? AArch64ISD::GLD1_IMM_MERGE_ZERO : AArch64ISD::SST1_IMM_PRED;
3956 
3957   if (!Offset) {
3958     std::swap(BasePtr, Index);
3959     Opcode = NewOp;
3960     return;
3961   }
3962 
3963   uint64_t OffsetVal = Offset->getZExtValue();
3964   unsigned ScalarSizeInBytes = MemVT.getScalarSizeInBits() / 8;
3965   auto ConstOffset = DAG.getConstant(OffsetVal, SDLoc(Index), MVT::i64);
3966 
3967   if (OffsetVal % ScalarSizeInBytes || OffsetVal / ScalarSizeInBytes > 31) {
3968     // Index is out of range for the immediate addressing mode
3969     BasePtr = ConstOffset;
3970     Index = Index->getOperand(0);
3971     return;
3972   }
3973 
3974   // Immediate is in range
3975   Opcode = NewOp;
3976   BasePtr = Index->getOperand(0);
3977   Index = ConstOffset;
3978 }
3979 
3980 SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
3981                                             SelectionDAG &DAG) const {
3982   SDLoc DL(Op);
3983   MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
3984   assert(MGT && "Can only custom lower gather load nodes");
3985 
3986   SDValue Index = MGT->getIndex();
3987   SDValue Chain = MGT->getChain();
3988   SDValue PassThru = MGT->getPassThru();
3989   SDValue Mask = MGT->getMask();
3990   SDValue BasePtr = MGT->getBasePtr();
3991   ISD::LoadExtType ExtTy = MGT->getExtensionType();
3992 
3993   ISD::MemIndexType IndexType = MGT->getIndexType();
3994   bool IsScaled =
3995       IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
3996   bool IsSigned =
3997       IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
3998   bool IdxNeedsExtend =
3999       getGatherScatterIndexIsExtended(Index) ||
4000       Index.getSimpleValueType().getVectorElementType() == MVT::i32;
4001   bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD;
4002 
4003   EVT VT = PassThru.getSimpleValueType();
4004   EVT MemVT = MGT->getMemoryVT();
4005   SDValue InputVT = DAG.getValueType(MemVT);
4006 
4007   if (VT.getVectorElementType() == MVT::bf16 &&
4008       !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
4009     return SDValue();
4010 
4011   // Handle FP data by using an integer gather and casting the result.
4012   if (VT.isFloatingPoint()) {
4013     EVT PassThruVT = getPackedSVEVectorVT(VT.getVectorElementCount());
4014     PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG);
4015     InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
4016   }
4017 
4018   SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other);
4019 
4020   if (getGatherScatterIndexIsExtended(Index))
4021     Index = Index.getOperand(0);
4022 
4023   unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend);
4024   selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
4025                               /*isGather=*/true, DAG);
4026 
4027   if (ResNeedsSignExtend)
4028     Opcode = getSignExtendedGatherOpcode(Opcode);
4029 
4030   SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
4031   SDValue Gather = DAG.getNode(Opcode, DL, VTs, Ops);
4032 
4033   if (VT.isFloatingPoint()) {
4034     SDValue Cast = getSVESafeBitCast(VT, Gather, DAG);
4035     return DAG.getMergeValues({Cast, Gather}, DL);
4036   }
4037 
4038   return Gather;
4039 }
4040 
4041 SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
4042                                              SelectionDAG &DAG) const {
4043   SDLoc DL(Op);
4044   MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
4045   assert(MSC && "Can only custom lower scatter store nodes");
4046 
4047   SDValue Index = MSC->getIndex();
4048   SDValue Chain = MSC->getChain();
4049   SDValue StoreVal = MSC->getValue();
4050   SDValue Mask = MSC->getMask();
4051   SDValue BasePtr = MSC->getBasePtr();
4052 
4053   ISD::MemIndexType IndexType = MSC->getIndexType();
4054   bool IsScaled =
4055       IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
4056   bool IsSigned =
4057       IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
4058   bool NeedsExtend =
4059       getGatherScatterIndexIsExtended(Index) ||
4060       Index.getSimpleValueType().getVectorElementType() == MVT::i32;
4061 
4062   EVT VT = StoreVal.getSimpleValueType();
4063   SDVTList VTs = DAG.getVTList(MVT::Other);
4064   EVT MemVT = MSC->getMemoryVT();
4065   SDValue InputVT = DAG.getValueType(MemVT);
4066 
4067   if (VT.getVectorElementType() == MVT::bf16 &&
4068       !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
4069     return SDValue();
4070 
4071   // Handle FP data by casting the data so an integer scatter can be used.
4072   if (VT.isFloatingPoint()) {
4073     EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount());
4074     StoreVal = getSVESafeBitCast(StoreValVT, StoreVal, DAG);
4075     InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
4076   }
4077 
4078   if (getGatherScatterIndexIsExtended(Index))
4079     Index = Index.getOperand(0);
4080 
4081   unsigned Opcode = getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend);
4082   selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
4083                               /*isGather=*/false, DAG);
4084 
4085   SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
4086   return DAG.getNode(Opcode, DL, VTs, Ops);
4087 }
4088 
4089 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
4090 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
4091                                         EVT VT, EVT MemVT,
4092                                         SelectionDAG &DAG) {
4093   assert(VT.isVector() && "VT should be a vector type");
4094   assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
4095 
4096   SDValue Value = ST->getValue();
4097 
4098   // It first extend the promoted v4i16 to v8i16, truncate to v8i8, and extract
4099   // the word lane which represent the v4i8 subvector.  It optimizes the store
4100   // to:
4101   //
4102   //   xtn  v0.8b, v0.8h
4103   //   str  s0, [x0]
4104 
4105   SDValue Undef = DAG.getUNDEF(MVT::i16);
4106   SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
4107                                         {Undef, Undef, Undef, Undef});
4108 
4109   SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
4110                                  Value, UndefVec);
4111   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
4112 
4113   Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
4114   SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
4115                                      Trunc, DAG.getConstant(0, DL, MVT::i64));
4116 
4117   return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
4118                       ST->getBasePtr(), ST->getMemOperand());
4119 }
4120 
4121 // Custom lowering for any store, vector or scalar and/or default or with
4122 // a truncate operations.  Currently only custom lower truncate operation
4123 // from vector v4i16 to v4i8 or volatile stores of i128.
4124 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
4125                                           SelectionDAG &DAG) const {
4126   SDLoc Dl(Op);
4127   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
4128   assert (StoreNode && "Can only custom lower store nodes");
4129 
4130   SDValue Value = StoreNode->getValue();
4131 
4132   EVT VT = Value.getValueType();
4133   EVT MemVT = StoreNode->getMemoryVT();
4134 
4135   if (VT.isVector()) {
4136     if (useSVEForFixedLengthVectorVT(VT))
4137       return LowerFixedLengthVectorStoreToSVE(Op, DAG);
4138 
4139     unsigned AS = StoreNode->getAddressSpace();
4140     Align Alignment = StoreNode->getAlign();
4141     if (Alignment < MemVT.getStoreSize() &&
4142         !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(),
4143                                         StoreNode->getMemOperand()->getFlags(),
4144                                         nullptr)) {
4145       return scalarizeVectorStore(StoreNode, DAG);
4146     }
4147 
4148     if (StoreNode->isTruncatingStore()) {
4149       return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
4150     }
4151     // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
4152     // the custom lowering, as there are no un-paired non-temporal stores and
4153     // legalization will break up 256 bit inputs.
4154     ElementCount EC = MemVT.getVectorElementCount();
4155     if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
4156         EC.isKnownEven() &&
4157         ((MemVT.getScalarSizeInBits() == 8u ||
4158           MemVT.getScalarSizeInBits() == 16u ||
4159           MemVT.getScalarSizeInBits() == 32u ||
4160           MemVT.getScalarSizeInBits() == 64u))) {
4161       SDValue Lo =
4162           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
4163                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
4164                       StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
4165       SDValue Hi =
4166           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
4167                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
4168                       StoreNode->getValue(),
4169                       DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
4170       SDValue Result = DAG.getMemIntrinsicNode(
4171           AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
4172           {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
4173           StoreNode->getMemoryVT(), StoreNode->getMemOperand());
4174       return Result;
4175     }
4176   } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
4177     assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
4178     SDValue Lo =
4179         DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
4180                     DAG.getConstant(0, Dl, MVT::i64));
4181     SDValue Hi =
4182         DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
4183                     DAG.getConstant(1, Dl, MVT::i64));
4184     SDValue Result = DAG.getMemIntrinsicNode(
4185         AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other),
4186         {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
4187         StoreNode->getMemoryVT(), StoreNode->getMemOperand());
4188     return Result;
4189   }
4190 
4191   return SDValue();
4192 }
4193 
4194 // Generate SUBS and CSEL for integer abs.
4195 SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
4196   MVT VT = Op.getSimpleValueType();
4197 
4198   if (VT.isVector())
4199     return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
4200 
4201   SDLoc DL(Op);
4202   SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
4203                             Op.getOperand(0));
4204   // Generate SUBS & CSEL.
4205   SDValue Cmp =
4206       DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
4207                   Op.getOperand(0), DAG.getConstant(0, DL, VT));
4208   return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
4209                      DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
4210                      Cmp.getValue(1));
4211 }
4212 
/// Central dispatch for custom lowering: forwards \p Op to the helper that
/// handles its opcode and returns whatever that helper produces.
///
/// Reaching this function with an opcode that has no case below is treated as
/// a programming error (llvm_unreachable). A few cases may return an empty
/// SDValue (see SIGN_EXTEND_INREG); individual helpers may do so as well.
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                              SelectionDAG &DAG) const {
  LLVM_DEBUG(dbgs() << "Custom lowering: ");
  LLVM_DEBUG(Op.dump());

  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("unimplemented operand");
    return SDValue();
  // Addresses, comparisons, selects and control flow.
  case ISD::BITCAST:
    return LowerBITCAST(Op, DAG);
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:
    return LowerGlobalTLSAddress(Op, DAG);
  case ISD::SETCC:
  case ISD::STRICT_FSETCC:
  case ISD::STRICT_FSETCCS:
    return LowerSETCC(Op, DAG);
  case ISD::BR_CC:
    return LowerBR_CC(Op, DAG);
  case ISD::SELECT:
    return LowerSELECT(Op, DAG);
  case ISD::SELECT_CC:
    return LowerSELECT_CC(Op, DAG);
  case ISD::JumpTable:
    return LowerJumpTable(Op, DAG);
  case ISD::BR_JT:
    return LowerBR_JT(Op, DAG);
  case ISD::ConstantPool:
    return LowerConstantPool(Op, DAG);
  case ISD::BlockAddress:
    return LowerBlockAddress(Op, DAG);
  // Variadic argument handling.
  case ISD::VASTART:
    return LowerVASTART(Op, DAG);
  case ISD::VACOPY:
    return LowerVACOPY(Op, DAG);
  case ISD::VAARG:
    return LowerVAARG(Op, DAG);
  // Carry-using and overflow-reporting arithmetic.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:
    return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:
    return LowerXALUO(Op, DAG);
  // Floating-point operations routed to their predicated AArch64ISD
  // counterparts.
  case ISD::FADD:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
  case ISD::FSUB:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
  case ISD::FMUL:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
  case ISD::FMA:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
  case ISD::FDIV:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
  case ISD::FNEG:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
  case ISD::FCEIL:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
  case ISD::FFLOOR:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
  case ISD::FNEARBYINT:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
  case ISD::FRINT:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
  case ISD::FROUND:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
  case ISD::FROUNDEVEN:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
  case ISD::FTRUNC:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
  case ISD::FSQRT:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
  case ISD::FABS:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
  case ISD::FP_ROUND:
  case ISD::STRICT_FP_ROUND:
    return LowerFP_ROUND(Op, DAG);
  case ISD::FP_EXTEND:
    return LowerFP_EXTEND(Op, DAG);
  // Frame and return address queries.
  case ISD::FRAMEADDR:
    return LowerFRAMEADDR(Op, DAG);
  case ISD::SPONENTRY:
    return LowerSPONENTRY(Op, DAG);
  case ISD::RETURNADDR:
    return LowerRETURNADDR(Op, DAG);
  case ISD::ADDROFRETURNADDR:
    return LowerADDROFRETURNADDR(Op, DAG);
  // Vector construction, element access and shuffles.
  case ISD::CONCAT_VECTORS:
    return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::BUILD_VECTOR:
    return LowerBUILD_VECTOR(Op, DAG);
  case ISD::VECTOR_SHUFFLE:
    return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::SPLAT_VECTOR:
    return LowerSPLAT_VECTOR(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR:
    return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::INSERT_SUBVECTOR:
    return LowerINSERT_SUBVECTOR(Op, DAG);
  // Integer division, min/max and shifts.
  case ISD::SDIV:
  case ISD::UDIV:
    return LowerDIV(Op, DAG);
  case ISD::SMIN:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED,
                               /*OverrideNEON=*/true);
  case ISD::UMIN:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED,
                               /*OverrideNEON=*/true);
  case ISD::SMAX:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED,
                               /*OverrideNEON=*/true);
  case ISD::UMAX:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED,
                               /*OverrideNEON=*/true);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL:
    return LowerVectorSRA_SRL_SHL(Op, DAG);
  case ISD::SHL_PARTS:
    return LowerShiftLeftParts(Op, DAG);
  case ISD::SRL_PARTS:
  case ISD::SRA_PARTS:
    return LowerShiftRightParts(Op, DAG);
  case ISD::CTPOP:
    return LowerCTPOP(Op, DAG);
  case ISD::FCOPYSIGN:
    return LowerFCOPYSIGN(Op, DAG);
  case ISD::OR:
    return LowerVectorOR(Op, DAG);
  case ISD::XOR:
    return LowerXOR(Op, DAG);
  case ISD::PREFETCH:
    return LowerPREFETCH(Op, DAG);
  // Integer <-> floating-point conversions (including strict variants).
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
  case ISD::STRICT_SINT_TO_FP:
  case ISD::STRICT_UINT_TO_FP:
    return LowerINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
  case ISD::STRICT_FP_TO_SINT:
  case ISD::STRICT_FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::FSINCOS:
    return LowerFSINCOS(Op, DAG);
  case ISD::FLT_ROUNDS_:
    return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::MUL:
    return LowerMUL(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN:
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  // Memory operations.
  case ISD::STORE:
    return LowerSTORE(Op, DAG);
  case ISD::MGATHER:
    return LowerMGATHER(Op, DAG);
  case ISD::MSCATTER:
    return LowerMSCATTER(Op, DAG);
  // Vector reductions.
  case ISD::VECREDUCE_SEQ_FADD:
    return LowerVECREDUCE_SEQ_FADD(Op, DAG);
  case ISD::VECREDUCE_ADD:
  case ISD::VECREDUCE_AND:
  case ISD::VECREDUCE_OR:
  case ISD::VECREDUCE_XOR:
  case ISD::VECREDUCE_SMAX:
  case ISD::VECREDUCE_SMIN:
  case ISD::VECREDUCE_UMAX:
  case ISD::VECREDUCE_UMIN:
  case ISD::VECREDUCE_FADD:
  case ISD::VECREDUCE_FMAX:
  case ISD::VECREDUCE_FMIN:
    return LowerVECREDUCE(Op, DAG);
  case ISD::ATOMIC_LOAD_SUB:
    return LowerATOMIC_LOAD_SUB(Op, DAG);
  case ISD::ATOMIC_LOAD_AND:
    return LowerATOMIC_LOAD_AND(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::VSCALE:
    return LowerVSCALE(Op, DAG);
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
    return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
  case ISD::SIGN_EXTEND_INREG: {
    // Only custom lower when ExtraVT has a legal byte based element type.
    EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
    EVT ExtraEltVT = ExtraVT.getVectorElementType();
    if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
        (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
      return SDValue();

    return LowerToPredicatedOp(Op, DAG,
                               AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
  }
  case ISD::TRUNCATE:
    return LowerTRUNCATE(Op, DAG);
  case ISD::LOAD:
    // The only load handled here is a fixed length vector load that is to be
    // lowered via SVE; anything else is unexpected.
    if (useSVEForFixedLengthVectorVT(Op.getValueType()))
      return LowerFixedLengthVectorLoadToSVE(Op, DAG);
    llvm_unreachable("Unexpected request to lower ISD::LOAD");
  case ISD::ADD:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
  case ISD::AND:
    return LowerToScalableOp(Op, DAG);
  case ISD::SUB:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SUB_PRED);
  case ISD::FMAXNUM:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
  case ISD::FMINNUM:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
  case ISD::VSELECT:
    return LowerFixedLengthVectorSelectToSVE(Op, DAG);
  case ISD::ABS:
    return LowerABS(Op, DAG);
  case ISD::BITREVERSE:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU,
                               /*OverrideNEON=*/true);
  case ISD::BSWAP:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
  case ISD::CTLZ:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU,
                               /*OverrideNEON=*/true);
  case ISD::CTTZ:
    return LowerCTTZ(Op, DAG);
  }
}
4450 
4451 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
4452   return !Subtarget->useSVEForFixedLengthVectors();
4453 }
4454 
4455 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
4456     EVT VT, bool OverrideNEON) const {
4457   if (!Subtarget->useSVEForFixedLengthVectors())
4458     return false;
4459 
4460   if (!VT.isFixedLengthVector())
4461     return false;
4462 
4463   // Don't use SVE for vectors we cannot scalarize if required.
4464   switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
4465   // Fixed length predicates should be promoted to i8.
4466   // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
4467   case MVT::i1:
4468   default:
4469     return false;
4470   case MVT::i8:
4471   case MVT::i16:
4472   case MVT::i32:
4473   case MVT::i64:
4474   case MVT::f16:
4475   case MVT::f32:
4476   case MVT::f64:
4477     break;
4478   }
4479 
4480   // All SVE implementations support NEON sized vectors.
4481   if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
4482     return true;
4483 
4484   // Ensure NEON MVTs only belong to a single register class.
4485   if (VT.getFixedSizeInBits() <= 128)
4486     return false;
4487 
4488   // Don't use SVE for types that don't fit.
4489   if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
4490     return false;
4491 
4492   // TODO: Perhaps an artificial restriction, but worth having whilst getting
4493   // the base fixed length SVE support in place.
4494   if (!VT.isPow2VectorType())
4495     return false;
4496 
4497   return true;
4498 }
4499 
4500 //===----------------------------------------------------------------------===//
4501 //                      Calling Convention Implementation
4502 //===----------------------------------------------------------------------===//
4503 
4504 /// Selects the correct CCAssignFn for a given CallingConvention value.
4505 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
4506                                                      bool IsVarArg) const {
4507   switch (CC) {
4508   default:
4509     report_fatal_error("Unsupported calling convention.");
4510   case CallingConv::WebKit_JS:
4511     return CC_AArch64_WebKit_JS;
4512   case CallingConv::GHC:
4513     return CC_AArch64_GHC;
4514   case CallingConv::C:
4515   case CallingConv::Fast:
4516   case CallingConv::PreserveMost:
4517   case CallingConv::CXX_FAST_TLS:
4518   case CallingConv::Swift:
4519     if (Subtarget->isTargetWindows() && IsVarArg)
4520       return CC_AArch64_Win64_VarArg;
4521     if (!Subtarget->isTargetDarwin())
4522       return CC_AArch64_AAPCS;
4523     if (!IsVarArg)
4524       return CC_AArch64_DarwinPCS;
4525     return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
4526                                       : CC_AArch64_DarwinPCS_VarArg;
4527    case CallingConv::Win64:
4528     return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
4529    case CallingConv::CFGuard_Check:
4530      return CC_AArch64_Win64_CFGuard_Check;
4531    case CallingConv::AArch64_VectorCall:
4532    case CallingConv::AArch64_SVE_VectorCall:
4533      return CC_AArch64_AAPCS;
4534   }
4535 }
4536 
4537 CCAssignFn *
4538 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
4539   return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
4540                                       : RetCC_AArch64_AAPCS;
4541 }
4542 
4543 SDValue AArch64TargetLowering::LowerFormalArguments(
4544     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4545     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
4546     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4547   MachineFunction &MF = DAG.getMachineFunction();
4548   MachineFrameInfo &MFI = MF.getFrameInfo();
4549   bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
4550 
4551   // Assign locations to all of the incoming arguments.
4552   SmallVector<CCValAssign, 16> ArgLocs;
4553   DenseMap<unsigned, SDValue> CopiedRegs;
4554   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4555                  *DAG.getContext());
4556 
4557   // At this point, Ins[].VT may already be promoted to i32. To correctly
4558   // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
4559   // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
4560   // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
4561   // we use a special version of AnalyzeFormalArguments to pass in ValVT and
4562   // LocVT.
4563   unsigned NumArgs = Ins.size();
4564   Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
4565   unsigned CurArgIdx = 0;
4566   for (unsigned i = 0; i != NumArgs; ++i) {
4567     MVT ValVT = Ins[i].VT;
4568     if (Ins[i].isOrigArg()) {
4569       std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
4570       CurArgIdx = Ins[i].getOrigArgIndex();
4571 
4572       // Get type of the original argument.
4573       EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
4574                                   /*AllowUnknown*/ true);
4575       MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
4576       // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
4577       if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
4578         ValVT = MVT::i8;
4579       else if (ActualMVT == MVT::i16)
4580         ValVT = MVT::i16;
4581     }
4582     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
4583     bool Res =
4584         AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
4585     assert(!Res && "Call operand has unhandled type");
4586     (void)Res;
4587   }
4588   SmallVector<SDValue, 16> ArgValues;
4589   unsigned ExtraArgLocs = 0;
4590   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4591     CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
4592 
4593     if (Ins[i].Flags.isByVal()) {
4594       // Byval is used for HFAs in the PCS, but the system should work in a
4595       // non-compliant manner for larger structs.
4596       EVT PtrVT = getPointerTy(DAG.getDataLayout());
4597       int Size = Ins[i].Flags.getByValSize();
4598       unsigned NumRegs = (Size + 7) / 8;
4599 
4600       // FIXME: This works on big-endian for composite byvals, which are the common
4601       // case. It should also work for fundamental types too.
4602       unsigned FrameIdx =
4603         MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
4604       SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
4605       InVals.push_back(FrameIdxN);
4606 
4607       continue;
4608     }
4609 
4610     SDValue ArgValue;
4611     if (VA.isRegLoc()) {
4612       // Arguments stored in registers.
4613       EVT RegVT = VA.getLocVT();
4614       const TargetRegisterClass *RC;
4615 
4616       if (RegVT == MVT::i32)
4617         RC = &AArch64::GPR32RegClass;
4618       else if (RegVT == MVT::i64)
4619         RC = &AArch64::GPR64RegClass;
4620       else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4621         RC = &AArch64::FPR16RegClass;
4622       else if (RegVT == MVT::f32)
4623         RC = &AArch64::FPR32RegClass;
4624       else if (RegVT == MVT::f64 || RegVT.is64BitVector())
4625         RC = &AArch64::FPR64RegClass;
4626       else if (RegVT == MVT::f128 || RegVT.is128BitVector())
4627         RC = &AArch64::FPR128RegClass;
4628       else if (RegVT.isScalableVector() &&
4629                RegVT.getVectorElementType() == MVT::i1)
4630         RC = &AArch64::PPRRegClass;
4631       else if (RegVT.isScalableVector())
4632         RC = &AArch64::ZPRRegClass;
4633       else
4634         llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4635 
4636       // Transform the arguments in physical registers into virtual ones.
4637       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
4638       ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
4639 
4640       // If this is an 8, 16 or 32-bit value, it is really passed promoted
4641       // to 64 bits.  Insert an assert[sz]ext to capture this, then
4642       // truncate to the right size.
4643       switch (VA.getLocInfo()) {
4644       default:
4645         llvm_unreachable("Unknown loc info!");
4646       case CCValAssign::Full:
4647         break;
4648       case CCValAssign::Indirect:
4649         assert(VA.getValVT().isScalableVector() &&
4650                "Only scalable vectors can be passed indirectly");
4651         break;
4652       case CCValAssign::BCvt:
4653         ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
4654         break;
4655       case CCValAssign::AExt:
4656       case CCValAssign::SExt:
4657       case CCValAssign::ZExt:
4658         break;
4659       case CCValAssign::AExtUpper:
4660         ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
4661                                DAG.getConstant(32, DL, RegVT));
4662         ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
4663         break;
4664       }
4665     } else { // VA.isRegLoc()
4666       assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
4667       unsigned ArgOffset = VA.getLocMemOffset();
4668       unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
4669                               ? VA.getLocVT().getSizeInBits()
4670                               : VA.getValVT().getSizeInBits()) / 8;
4671 
4672       uint32_t BEAlign = 0;
4673       if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
4674           !Ins[i].Flags.isInConsecutiveRegs())
4675         BEAlign = 8 - ArgSize;
4676 
4677       int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
4678 
4679       // Create load nodes to retrieve arguments from the stack.
4680       SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4681 
4682       // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
4683       ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
4684       MVT MemVT = VA.getValVT();
4685 
4686       switch (VA.getLocInfo()) {
4687       default:
4688         break;
4689       case CCValAssign::Trunc:
4690       case CCValAssign::BCvt:
4691         MemVT = VA.getLocVT();
4692         break;
4693       case CCValAssign::Indirect:
4694         assert(VA.getValVT().isScalableVector() &&
4695                "Only scalable vectors can be passed indirectly");
4696         MemVT = VA.getLocVT();
4697         break;
4698       case CCValAssign::SExt:
4699         ExtType = ISD::SEXTLOAD;
4700         break;
4701       case CCValAssign::ZExt:
4702         ExtType = ISD::ZEXTLOAD;
4703         break;
4704       case CCValAssign::AExt:
4705         ExtType = ISD::EXTLOAD;
4706         break;
4707       }
4708 
4709       ArgValue = DAG.getExtLoad(
4710           ExtType, DL, VA.getLocVT(), Chain, FIN,
4711           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
4712           MemVT);
4713 
4714     }
4715 
4716     if (VA.getLocInfo() == CCValAssign::Indirect) {
4717       assert(VA.getValVT().isScalableVector() &&
4718            "Only scalable vectors can be passed indirectly");
4719 
4720       uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize();
4721       unsigned NumParts = 1;
4722       if (Ins[i].Flags.isInConsecutiveRegs()) {
4723         assert(!Ins[i].Flags.isInConsecutiveRegsLast());
4724         while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
4725           ++NumParts;
4726       }
4727 
4728       MVT PartLoad = VA.getValVT();
4729       SDValue Ptr = ArgValue;
4730 
4731       // Ensure we generate all loads for each tuple part, whilst updating the
4732       // pointer after each load correctly using vscale.
4733       while (NumParts > 0) {
4734         ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
4735         InVals.push_back(ArgValue);
4736         NumParts--;
4737         if (NumParts > 0) {
4738           SDValue BytesIncrement = DAG.getVScale(
4739               DL, Ptr.getValueType(),
4740               APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
4741           SDNodeFlags Flags;
4742           Flags.setNoUnsignedWrap(true);
4743           Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
4744                             BytesIncrement, Flags);
4745           ExtraArgLocs++;
4746           i++;
4747         }
4748       }
4749     } else {
4750       if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
4751         ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
4752                                ArgValue, DAG.getValueType(MVT::i32));
4753       InVals.push_back(ArgValue);
4754     }
4755   }
4756   assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
4757 
4758   // varargs
4759   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
4760   if (isVarArg) {
4761     if (!Subtarget->isTargetDarwin() || IsWin64) {
4762       // The AAPCS variadic function ABI is identical to the non-variadic
4763       // one. As a result there may be more arguments in registers and we should
4764       // save them for future reference.
4765       // Win64 variadic functions also pass arguments in registers, but all float
4766       // arguments are passed in integer registers.
4767       saveVarArgRegisters(CCInfo, DAG, DL, Chain);
4768     }
4769 
4770     // This will point to the next argument passed via stack.
4771     unsigned StackOffset = CCInfo.getNextStackOffset();
4772     // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
4773     StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
4774     FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
4775 
4776     if (MFI.hasMustTailInVarArgFunc()) {
4777       SmallVector<MVT, 2> RegParmTypes;
4778       RegParmTypes.push_back(MVT::i64);
4779       RegParmTypes.push_back(MVT::f128);
4780       // Compute the set of forwarded registers. The rest are scratch.
4781       SmallVectorImpl<ForwardedRegister> &Forwards =
4782                                        FuncInfo->getForwardedMustTailRegParms();
4783       CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
4784                                                CC_AArch64_AAPCS);
4785 
4786       // Conservatively forward X8, since it might be used for aggregate return.
4787       if (!CCInfo.isAllocated(AArch64::X8)) {
4788         unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
4789         Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
4790       }
4791     }
4792   }
4793 
4794   // On Windows, InReg pointers must be returned, so record the pointer in a
4795   // virtual register at the start of the function so it can be returned in the
4796   // epilogue.
4797   if (IsWin64) {
4798     for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
4799       if (Ins[I].Flags.isInReg()) {
4800         assert(!FuncInfo->getSRetReturnReg());
4801 
4802         MVT PtrTy = getPointerTy(DAG.getDataLayout());
4803         Register Reg =
4804             MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
4805         FuncInfo->setSRetReturnReg(Reg);
4806 
4807         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
4808         Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
4809         break;
4810       }
4811     }
4812   }
4813 
4814   unsigned StackArgSize = CCInfo.getNextStackOffset();
4815   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4816   if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
4817     // This is a non-standard ABI so by fiat I say we're allowed to make full
4818     // use of the stack area to be popped, which must be aligned to 16 bytes in
4819     // any case:
4820     StackArgSize = alignTo(StackArgSize, 16);
4821 
4822     // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
4823     // a multiple of 16.
4824     FuncInfo->setArgumentStackToRestore(StackArgSize);
4825 
4826     // This realignment carries over to the available bytes below. Our own
4827     // callers will guarantee the space is free by giving an aligned value to
4828     // CALLSEQ_START.
4829   }
4830   // Even if we're not expected to free up the space, it's useful to know how
4831   // much is there while considering tail calls (because we can reuse it).
4832   FuncInfo->setBytesInStackArgArea(StackArgSize);
4833 
4834   if (Subtarget->hasCustomCallingConv())
4835     Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
4836 
4837   return Chain;
4838 }
4839 
4840 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
4841                                                 SelectionDAG &DAG,
4842                                                 const SDLoc &DL,
4843                                                 SDValue &Chain) const {
4844   MachineFunction &MF = DAG.getMachineFunction();
4845   MachineFrameInfo &MFI = MF.getFrameInfo();
4846   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
4847   auto PtrVT = getPointerTy(DAG.getDataLayout());
4848   bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
4849 
4850   SmallVector<SDValue, 8> MemOps;
4851 
4852   static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
4853                                           AArch64::X3, AArch64::X4, AArch64::X5,
4854                                           AArch64::X6, AArch64::X7 };
4855   static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
4856   unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
4857 
4858   unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
4859   int GPRIdx = 0;
4860   if (GPRSaveSize != 0) {
4861     if (IsWin64) {
4862       GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
4863       if (GPRSaveSize & 15)
4864         // The extra size here, if triggered, will always be 8.
4865         MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
4866     } else
4867       GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
4868 
4869     SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
4870 
4871     for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
4872       unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
4873       SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
4874       SDValue Store = DAG.getStore(
4875           Val.getValue(1), DL, Val, FIN,
4876           IsWin64
4877               ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
4878                                                   GPRIdx,
4879                                                   (i - FirstVariadicGPR) * 8)
4880               : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
4881       MemOps.push_back(Store);
4882       FIN =
4883           DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
4884     }
4885   }
4886   FuncInfo->setVarArgsGPRIndex(GPRIdx);
4887   FuncInfo->setVarArgsGPRSize(GPRSaveSize);
4888 
4889   if (Subtarget->hasFPARMv8() && !IsWin64) {
4890     static const MCPhysReg FPRArgRegs[] = {
4891         AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
4892         AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
4893     static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
4894     unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
4895 
4896     unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
4897     int FPRIdx = 0;
4898     if (FPRSaveSize != 0) {
4899       FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
4900 
4901       SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
4902 
4903       for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
4904         unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
4905         SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
4906 
4907         SDValue Store = DAG.getStore(
4908             Val.getValue(1), DL, Val, FIN,
4909             MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
4910         MemOps.push_back(Store);
4911         FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
4912                           DAG.getConstant(16, DL, PtrVT));
4913       }
4914     }
4915     FuncInfo->setVarArgsFPRIndex(FPRIdx);
4916     FuncInfo->setVarArgsFPRSize(FPRSaveSize);
4917   }
4918 
4919   if (!MemOps.empty()) {
4920     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
4921   }
4922 }
4923 
4924 /// LowerCallResult - Lower the result values of a call into the
4925 /// appropriate copies out of appropriate physical registers.
4926 SDValue AArch64TargetLowering::LowerCallResult(
4927     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
4928     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
4929     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
4930     SDValue ThisVal) const {
4931   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
4932   // Assign locations to each value returned by this call.
4933   SmallVector<CCValAssign, 16> RVLocs;
4934   DenseMap<unsigned, SDValue> CopiedRegs;
4935   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
4936                  *DAG.getContext());
4937   CCInfo.AnalyzeCallResult(Ins, RetCC);
4938 
4939   // Copy all of the result registers out of their specified physreg.
4940   for (unsigned i = 0; i != RVLocs.size(); ++i) {
4941     CCValAssign VA = RVLocs[i];
4942 
4943     // Pass 'this' value directly from the argument to return value, to avoid
4944     // reg unit interference
4945     if (i == 0 && isThisReturn) {
4946       assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
4947              "unexpected return calling convention register assignment");
4948       InVals.push_back(ThisVal);
4949       continue;
4950     }
4951 
4952     // Avoid copying a physreg twice since RegAllocFast is incompetent and only
4953     // allows one use of a physreg per block.
4954     SDValue Val = CopiedRegs.lookup(VA.getLocReg());
4955     if (!Val) {
4956       Val =
4957           DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
4958       Chain = Val.getValue(1);
4959       InFlag = Val.getValue(2);
4960       CopiedRegs[VA.getLocReg()] = Val;
4961     }
4962 
4963     switch (VA.getLocInfo()) {
4964     default:
4965       llvm_unreachable("Unknown loc info!");
4966     case CCValAssign::Full:
4967       break;
4968     case CCValAssign::BCvt:
4969       Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
4970       break;
4971     case CCValAssign::AExtUpper:
4972       Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
4973                         DAG.getConstant(32, DL, VA.getLocVT()));
4974       LLVM_FALLTHROUGH;
4975     case CCValAssign::AExt:
4976       LLVM_FALLTHROUGH;
4977     case CCValAssign::ZExt:
4978       Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
4979       break;
4980     }
4981 
4982     InVals.push_back(Val);
4983   }
4984 
4985   return Chain;
4986 }
4987 
4988 /// Return true if the calling convention is one that we can guarantee TCO for.
4989 static bool canGuaranteeTCO(CallingConv::ID CC) {
4990   return CC == CallingConv::Fast;
4991 }
4992 
4993 /// Return true if we might ever do TCO for calls with this calling convention.
4994 static bool mayTailCallThisCC(CallingConv::ID CC) {
4995   switch (CC) {
4996   case CallingConv::C:
4997   case CallingConv::AArch64_SVE_VectorCall:
4998   case CallingConv::PreserveMost:
4999   case CallingConv::Swift:
5000     return true;
5001   default:
5002     return canGuaranteeTCO(CC);
5003   }
5004 }
5005 
5006 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
5007     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
5008     const SmallVectorImpl<ISD::OutputArg> &Outs,
5009     const SmallVectorImpl<SDValue> &OutVals,
5010     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
5011   if (!mayTailCallThisCC(CalleeCC))
5012     return false;
5013 
5014   MachineFunction &MF = DAG.getMachineFunction();
5015   const Function &CallerF = MF.getFunction();
5016   CallingConv::ID CallerCC = CallerF.getCallingConv();
5017 
5018   // If this function uses the C calling convention but has an SVE signature,
5019   // then it preserves more registers and should assume the SVE_VectorCall CC.
5020   // The check for matching callee-saved regs will determine whether it is
5021   // eligible for TCO.
5022   if (CallerCC == CallingConv::C &&
5023       AArch64RegisterInfo::hasSVEArgsOrReturn(&MF))
5024     CallerCC = CallingConv::AArch64_SVE_VectorCall;
5025 
5026   bool CCMatch = CallerCC == CalleeCC;
5027 
5028   // When using the Windows calling convention on a non-windows OS, we want
5029   // to back up and restore X18 in such functions; we can't do a tail call
5030   // from those functions.
5031   if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
5032       CalleeCC != CallingConv::Win64)
5033     return false;
5034 
5035   // Byval parameters hand the function a pointer directly into the stack area
5036   // we want to reuse during a tail call. Working around this *is* possible (see
5037   // X86) but less efficient and uglier in LowerCall.
5038   for (Function::const_arg_iterator i = CallerF.arg_begin(),
5039                                     e = CallerF.arg_end();
5040        i != e; ++i) {
5041     if (i->hasByValAttr())
5042       return false;
5043 
5044     // On Windows, "inreg" attributes signify non-aggregate indirect returns.
5045     // In this case, it is necessary to save/restore X0 in the callee. Tail
5046     // call opt interferes with this. So we disable tail call opt when the
5047     // caller has an argument with "inreg" attribute.
5048 
5049     // FIXME: Check whether the callee also has an "inreg" argument.
5050     if (i->hasInRegAttr())
5051       return false;
5052   }
5053 
5054   if (getTargetMachine().Options.GuaranteedTailCallOpt)
5055     return canGuaranteeTCO(CalleeCC) && CCMatch;
5056 
5057   // Externally-defined functions with weak linkage should not be
5058   // tail-called on AArch64 when the OS does not support dynamic
5059   // pre-emption of symbols, as the AAELF spec requires normal calls
5060   // to undefined weak functions to be replaced with a NOP or jump to the
5061   // next instruction. The behaviour of branch instructions in this
5062   // situation (as used for tail calls) is implementation-defined, so we
5063   // cannot rely on the linker replacing the tail call with a return.
5064   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
5065     const GlobalValue *GV = G->getGlobal();
5066     const Triple &TT = getTargetMachine().getTargetTriple();
5067     if (GV->hasExternalWeakLinkage() &&
5068         (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
5069       return false;
5070   }
5071 
5072   // Now we search for cases where we can use a tail call without changing the
5073   // ABI. Sibcall is used in some places (particularly gcc) to refer to this
5074   // concept.
5075 
5076   // I want anyone implementing a new calling convention to think long and hard
5077   // about this assert.
5078   assert((!isVarArg || CalleeCC == CallingConv::C) &&
5079          "Unexpected variadic calling convention");
5080 
5081   LLVMContext &C = *DAG.getContext();
5082   if (isVarArg && !Outs.empty()) {
5083     // At least two cases here: if caller is fastcc then we can't have any
5084     // memory arguments (we'd be expected to clean up the stack afterwards). If
5085     // caller is C then we could potentially use its argument area.
5086 
5087     // FIXME: for now we take the most conservative of these in both cases:
5088     // disallow all variadic memory operands.
5089     SmallVector<CCValAssign, 16> ArgLocs;
5090     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5091 
5092     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
5093     for (const CCValAssign &ArgLoc : ArgLocs)
5094       if (!ArgLoc.isRegLoc())
5095         return false;
5096   }
5097 
5098   // Check that the call results are passed in the same way.
5099   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
5100                                   CCAssignFnForCall(CalleeCC, isVarArg),
5101                                   CCAssignFnForCall(CallerCC, isVarArg)))
5102     return false;
5103   // The callee has to preserve all registers the caller needs to preserve.
5104   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
5105   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
5106   if (!CCMatch) {
5107     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
5108     if (Subtarget->hasCustomCallingConv()) {
5109       TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
5110       TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
5111     }
5112     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
5113       return false;
5114   }
5115 
5116   // Nothing more to check if the callee is taking no arguments
5117   if (Outs.empty())
5118     return true;
5119 
5120   SmallVector<CCValAssign, 16> ArgLocs;
5121   CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5122 
5123   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
5124 
5125   const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5126 
5127   // If any of the arguments is passed indirectly, it must be SVE, so the
5128   // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
5129   // allocate space on the stack. That is why we determine this explicitly here
5130   // the call cannot be a tailcall.
5131   if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
5132         assert((A.getLocInfo() != CCValAssign::Indirect ||
5133                 A.getValVT().isScalableVector()) &&
5134                "Expected value to be scalable");
5135         return A.getLocInfo() == CCValAssign::Indirect;
5136       }))
5137     return false;
5138 
5139   // If the stack arguments for this call do not fit into our own save area then
5140   // the call cannot be made tail.
5141   if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
5142     return false;
5143 
5144   const MachineRegisterInfo &MRI = MF.getRegInfo();
5145   if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5146     return false;
5147 
5148   return true;
5149 }
5150 
5151 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
5152                                                    SelectionDAG &DAG,
5153                                                    MachineFrameInfo &MFI,
5154                                                    int ClobberedFI) const {
5155   SmallVector<SDValue, 8> ArgChains;
5156   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
5157   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
5158 
5159   // Include the original chain at the beginning of the list. When this is
5160   // used by target LowerCall hooks, this helps legalize find the
5161   // CALLSEQ_BEGIN node.
5162   ArgChains.push_back(Chain);
5163 
5164   // Add a chain value for each stack argument corresponding
5165   for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
5166                             UE = DAG.getEntryNode().getNode()->use_end();
5167        U != UE; ++U)
5168     if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
5169       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
5170         if (FI->getIndex() < 0) {
5171           int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
5172           int64_t InLastByte = InFirstByte;
5173           InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
5174 
5175           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
5176               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
5177             ArgChains.push_back(SDValue(L, 1));
5178         }
5179 
5180   // Build a tokenfactor for all the chains.
5181   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
5182 }
5183 
5184 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
5185                                                    bool TailCallOpt) const {
5186   return CallCC == CallingConv::Fast && TailCallOpt;
5187 }
5188 
5189 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
5190 /// and add input and output parameter nodes.
     ///
     /// Outline, as implemented below:
     ///  1. A C-convention call with scalable-vector arguments or results is
     ///     switched to AArch64_SVE_VectorCall.
     ///  2. A requested tail call is validated with
     ///     isEligibleForTailCallOptimization; failing to tail-call a 'musttail'
     ///     call site is a fatal error.
     ///  3. Every outgoing operand is assigned a register or stack location.
     ///  4. Operands are extended/bitcast/spilled as their location requires and
     ///     are then either queued for a register copy or stored to the stack.
     ///  5. The callee is turned into a target node and either a TC_RETURN (tail
     ///     call) or a CALL/CALL_RVMARKER followed by callseq_end is emitted.
     ///
     /// \param CLI    Description of the call. CLI.IsTailCall is updated in place
     ///               to say whether a tail call is actually emitted.
     /// \param InVals Passed on to LowerCallResult for non-tail calls.
     /// \returns the TC_RETURN node for a tail call, otherwise the value returned
     ///          by LowerCallResult.
5191 SDValue
5192 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
5193                                  SmallVectorImpl<SDValue> &InVals) const {
5194   SelectionDAG &DAG = CLI.DAG;
5195   SDLoc &DL = CLI.DL;
5196   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
5197   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
5198   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
5199   SDValue Chain = CLI.Chain;
5200   SDValue Callee = CLI.Callee;
5201   bool &IsTailCall = CLI.IsTailCall;
5202   CallingConv::ID CallConv = CLI.CallConv;
5203   bool IsVarArg = CLI.IsVarArg;
5204
5205   MachineFunction &MF = DAG.getMachineFunction();
5206   MachineFunction::CallSiteInfo CSInfo;
5207   bool IsThisReturn = false;
5208
5209   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5210   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
5211   bool IsSibCall = false;
5212
5213   // Check callee args/returns for SVE registers and set calling convention
5214   // accordingly.
5215   if (CallConv == CallingConv::C) {
5216     bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
5217       return Out.VT.isScalableVector();
5218     });
5219     bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){
5220       return In.VT.isScalableVector();
5221     });
5222
5223     if (CalleeInSVE || CalleeOutSVE)
5224       CallConv = CallingConv::AArch64_SVE_VectorCall;
5225   }
5226
5227   if (IsTailCall) {
5228     // Check if it's really possible to do a tail call.
5229     IsTailCall = isEligibleForTailCallOptimization(
5230         Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
5231     if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
5232       report_fatal_error("failed to perform tail call elimination on a call "
5233                          "site marked musttail");
5234
5235     // A sibling call is one where we're under the usual C ABI and not planning
5236     // to change that but can still do a tail call:
5237     if (!TailCallOpt && IsTailCall)
5238       IsSibCall = true;
5239
5240     if (IsTailCall)
5241       ++NumTailCalls;
5242   }
5243
5244   // Analyze operands of the call, assigning locations to each operand.
5245   SmallVector<CCValAssign, 16> ArgLocs;
5246   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
5247                  *DAG.getContext());
5248
5249   if (IsVarArg) {
5250     // Handle fixed and variable vector arguments differently.
5251     // Variable vector arguments always go into memory.
5252     unsigned NumArgs = Outs.size();
5253
5254     for (unsigned i = 0; i != NumArgs; ++i) {
5255       MVT ArgVT = Outs[i].VT;
5256       if (!Outs[i].IsFixed && ArgVT.isScalableVector())
5257         report_fatal_error("Passing SVE types to variadic functions is "
5258                            "currently not supported");
5259
5260       ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
           // Fixed operands use the non-variadic assignment function; only the
           // variable ones use the variadic one.
5261       CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
5262                                                /*IsVarArg=*/ !Outs[i].IsFixed);
5263       bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
5264       assert(!Res && "Call operand has unhandled type");
5265       (void)Res;
5266     }
5267   } else {
5268     // At this point, Outs[].VT may already be promoted to i32. To correctly
5269     // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
5270     // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
5271     // Since AnalyzeCallOperands uses Outs[].VT for both ValVT and LocVT, here
5272     // we use a special version of AnalyzeCallOperands to pass in ValVT and
5273     // LocVT.
         // NOTE(review): the loop below hands the narrowed type to the assignment
         // function as both ValVT and LocVT; confirm the description above is
         // still accurate.
5274     unsigned NumArgs = Outs.size();
5275     for (unsigned i = 0; i != NumArgs; ++i) {
5276       MVT ValVT = Outs[i].VT;
5277       // Get type of the original argument.
5278       EVT ActualVT = getValueType(DAG.getDataLayout(),
5279                                   CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
5280                                   /*AllowUnknown*/ true);
5281       MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
5282       ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
5283       // If ActualMVT is i1/i8/i16, narrow the assigned type to i8/i8/i16.
5284       if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
5285         ValVT = MVT::i8;
5286       else if (ActualMVT == MVT::i16)
5287         ValVT = MVT::i16;
5288
5289       CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
5290       bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
5291       assert(!Res && "Call operand has unhandled type");
5292       (void)Res;
5293     }
5294   }
5295
5296   // Get a count of how many bytes are to be pushed on the stack.
5297   unsigned NumBytes = CCInfo.getNextStackOffset();
5298
5299   if (IsSibCall) {
5300     // Since we're not changing the ABI to make this a tail call, the memory
5301     // operands are already available in the caller's incoming argument space.
5302     NumBytes = 0;
5303   }
5304
5305   // FPDiff is the byte offset of the call's argument area from the callee's.
5306   // Stores to callee stack arguments will be placed in FixedStackSlots offset
5307   // by this amount for a tail call. In a sibling call it must be 0 because the
5308   // caller will deallocate the entire stack and the callee still expects its
5309   // arguments to begin at SP+0. Completely unused for non-tail calls.
5310   int FPDiff = 0;
5311
5312   if (IsTailCall && !IsSibCall) {
5313     unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
5314
5315     // Since callee will pop argument stack as a tail call, we must keep the
5316     // popped size 16-byte aligned.
5317     NumBytes = alignTo(NumBytes, 16);
5318
5319     // FPDiff will be negative if this tail call requires more space than we
5320     // would automatically have in our incoming argument space. Positive if we
5321     // can actually shrink the stack.
5322     FPDiff = NumReusableBytes - NumBytes;
5323
5324     // The stack pointer must be 16-byte aligned at all times it's used for a
5325     // memory operation, which in practice means at *all* times and in
5326     // particular across call boundaries. Therefore our own arguments started at
5327     // a 16-byte aligned SP and the delta applied for the tail call should
5328     // satisfy the same constraint.
5329     assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
5330   }
5331
5332   // Adjust the stack pointer for the new arguments...
5333   // These operations are automatically eliminated by the prolog/epilog pass
5334   if (!IsSibCall)
5335     Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
5336
5337   SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
5338                                         getPointerTy(DAG.getDataLayout()));
5339
       // RegsToPass: (physical register, value) pairs copied into registers
       // after all stores have been emitted. RegsUsed: registers already present
       // in RegsToPass. MemOpChains: chains of the stack stores/memcpys.
5340   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
5341   SmallSet<unsigned, 8> RegsUsed;
5342   SmallVector<SDValue, 8> MemOpChains;
5343   auto PtrVT = getPointerTy(DAG.getDataLayout());
5344
       // A variadic musttail call also passes on the register parameters that
       // FuncInfo recorded as forwarded.
5345   if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
5346     const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
5347     for (const auto &F : Forwards) {
5348       SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
5349        RegsToPass.emplace_back(F.PReg, Val);
5350     }
5351   }
5352
5353   // Walk the register/memloc assignments, inserting copies/loads.
       // ExtraArgLocs counts the additional Outs entries skipped inside the
       // Indirect case below, so that ArgLocs[i - ExtraArgLocs] stays in step
       // with Outs[i].
5354   unsigned ExtraArgLocs = 0;
5355   for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
5356     CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
5357     SDValue Arg = OutVals[i];
5358     ISD::ArgFlagsTy Flags = Outs[i].Flags;
5359
5360     // Promote the value if needed.
5361     switch (VA.getLocInfo()) {
5362     default:
5363       llvm_unreachable("Unknown loc info!");
5364     case CCValAssign::Full:
5365       break;
5366     case CCValAssign::SExt:
5367       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
5368       break;
5369     case CCValAssign::ZExt:
5370       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
5371       break;
5372     case CCValAssign::AExt:
5373       if (Outs[i].ArgVT == MVT::i1) {
5374         // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
5375         Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
5376         Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
5377       }
5378       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
5379       break;
5380     case CCValAssign::AExtUpper:
           // Place the 32-bit value in the upper half of the 64-bit location.
5381       assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
5382       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
5383       Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
5384                         DAG.getConstant(32, DL, VA.getLocVT()));
5385       break;
5386     case CCValAssign::BCvt:
5387       Arg = DAG.getBitcast(VA.getLocVT(), Arg);
5388       break;
5389     case CCValAssign::Trunc:
5390       Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
5391       break;
5392     case CCValAssign::FPExt:
5393       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
5394       break;
5395     case CCValAssign::Indirect:
           // The value is spilled to a fresh scalable-vector stack object and the
           // address of that object is passed instead.
5396       assert(VA.getValVT().isScalableVector() &&
5397              "Only scalable vectors can be passed indirectly");
5398
5399       uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize();
5400       uint64_t PartSize = StoreSize;
5401       unsigned NumParts = 1;
5402       if (Outs[i].Flags.isInConsecutiveRegs()) {
             // Count the parts up to and including the one flagged as last; the
             // stack object must hold all of them.
5403         assert(!Outs[i].Flags.isInConsecutiveRegsLast());
5404         while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
5405           ++NumParts;
5406         StoreSize *= NumParts;
5407       }
5408
5409       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
5410       Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
5411       Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
5412       int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
5413       MFI.setStackID(FI, TargetStackID::ScalableVector);
5414
5415       MachinePointerInfo MPI =
5416           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
5417       SDValue Ptr = DAG.getFrameIndex(
5418           FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
5419       SDValue SpillSlot = Ptr;
5420
5421       // Ensure we generate all stores for each tuple part, whilst updating the
5422       // pointer after each store correctly using vscale.
           // Note that 'i' is advanced inside this loop, so OutVals[i] refers to
           // successive parts.
5423       while (NumParts) {
5424         Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
5425         NumParts--;
5426         if (NumParts > 0) {
5427           SDValue BytesIncrement = DAG.getVScale(
5428               DL, Ptr.getValueType(),
5429               APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
5430           SDNodeFlags Flags;
5431           Flags.setNoUnsignedWrap(true);
5432
5433           MPI = MachinePointerInfo(MPI.getAddrSpace());
5434           Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
5435                             BytesIncrement, Flags);
5436           ExtraArgLocs++;
5437           i++;
5438         }
5439       }
5440
5441       Arg = SpillSlot;
5442       break;
5443     }
5444
5445     if (VA.isRegLoc()) {
           // Record a 'returned' first argument of type i64; the result is later
           // taken directly from that argument if a suitable register mask exists.
5446       if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
5447           Outs[0].VT == MVT::i64) {
5448         assert(VA.getLocVT() == MVT::i64 &&
5449                "unexpected calling convention register assignment");
5450         assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
5451                "unexpected use of 'returned'");
5452         IsThisReturn = true;
5453       }
5454       if (RegsUsed.count(VA.getLocReg())) {
5455         // If this register has already been used then we're trying to pack
5456         // parts of an [N x i32] into an X-register. The extension type will
5457         // take care of putting the two halves in the right place but we have to
5458         // combine them.
5459         SDValue &Bits =
5460             llvm::find_if(RegsToPass,
5461                           [=](const std::pair<unsigned, SDValue> &Elt) {
5462                             return Elt.first == VA.getLocReg();
5463                           })
5464                 ->second;
5465         Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
5466         // Call site info is used for function's parameter entry value
5467         // tracking. For now we track only simple cases when parameter
5468         // is transferred through whole register.
5469         llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) {
5470           return ArgReg.Reg == VA.getLocReg();
5471         });
5472       } else {
5473         RegsToPass.emplace_back(VA.getLocReg(), Arg);
5474         RegsUsed.insert(VA.getLocReg());
5475         const TargetOptions &Options = DAG.getTarget().Options;
5476         if (Options.EmitCallSiteInfo)
5477           CSInfo.emplace_back(VA.getLocReg(), i);
5478       }
5479     } else {
5480       assert(VA.isMemLoc());
5481
5482       SDValue DstAddr;
5483       MachinePointerInfo DstInfo;
5484
5485       // FIXME: This works on big-endian for composite byvals, which are the
5486       // common case. It should also work for fundamental types too.
5487       uint32_t BEAlign = 0;
           // OpSize is computed in bits first and rounded up to bytes below.
5488       unsigned OpSize;
5489       if (VA.getLocInfo() == CCValAssign::Indirect)
5490         OpSize = VA.getLocVT().getFixedSizeInBits();
5491       else
5492         OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
5493                                  : VA.getValVT().getSizeInBits();
5494       OpSize = (OpSize + 7) / 8;
           // On big-endian targets a value narrower than 8 bytes is placed at the
           // end of its 8-byte slot.
5495       if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
5496           !Flags.isInConsecutiveRegs()) {
5497         if (OpSize < 8)
5498           BEAlign = 8 - OpSize;
5499       }
5500       unsigned LocMemOffset = VA.getLocMemOffset();
5501       int32_t Offset = LocMemOffset + BEAlign;
           // NOTE(review): this PtrOff is not read afterwards; the non-tail-call
           // branch below declares its own PtrOff that shadows it.
5502       SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
5503       PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
5504
5505       if (IsTailCall) {
             // Tail calls store into a fixed object displaced by FPDiff rather
             // than relative to the current stack pointer.
5506         Offset = Offset + FPDiff;
5507         int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
5508
5509         DstAddr = DAG.getFrameIndex(FI, PtrVT);
5510         DstInfo =
5511             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
5512
5513         // Make sure any stack arguments overlapping with where we're storing
5514         // are loaded before this eventual operation. Otherwise they'll be
5515         // clobbered.
5516         Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
5517       } else {
5518         SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
5519
5520         DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
5521         DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
5522                                                LocMemOffset);
5523       }
5524
5525       if (Outs[i].Flags.isByVal()) {
             // Byval arguments are copied into the outgoing area with a memcpy.
5526         SDValue SizeNode =
5527             DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
5528         SDValue Cpy = DAG.getMemcpy(
5529             Chain, DL, DstAddr, Arg, SizeNode,
5530             Outs[i].Flags.getNonZeroByValAlign(),
5531             /*isVol = */ false, /*AlwaysInline = */ false,
5532             /*isTailCall = */ false, DstInfo, MachinePointerInfo());
5533
5534         MemOpChains.push_back(Cpy);
5535       } else {
5536         // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
5537         // promoted to a legal register type i32, we should truncate Arg back to
5538         // i1/i8/i16.
5539         if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
5540             VA.getValVT() == MVT::i16)
5541           Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
5542
5543         SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
5544         MemOpChains.push_back(Store);
5545       }
5546     }
5547   }
5548
       // Join all argument stores so that everything after depends on them.
5549   if (!MemOpChains.empty())
5550     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
5551
5552   // Build a sequence of copy-to-reg nodes chained together with token chain
5553   // and flag operands which copy the outgoing args into the appropriate regs.
5554   SDValue InFlag;
5555   for (auto &RegToPass : RegsToPass) {
5556     Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
5557                              RegToPass.second, InFlag);
5558     InFlag = Chain.getValue(1);
5559   }
5560
5561   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
5562   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
5563   // node so that legalize doesn't hack it.
5564   if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
5565     auto GV = G->getGlobal();
5566     unsigned OpFlags =
5567         Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
5568     if (OpFlags & AArch64II::MO_GOT) {
5569       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
5570       Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
5571     } else {
           // NOTE(review): this GV shadows the identical GV declared above.
5572       const GlobalValue *GV = G->getGlobal();
5573       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
5574     }
5575   } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
         // External symbols are loaded through the GOT only for the large code
         // model on MachO targets.
5576     if (getTargetMachine().getCodeModel() == CodeModel::Large &&
5577         Subtarget->isTargetMachO()) {
5578       const char *Sym = S->getSymbol();
5579       Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
5580       Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
5581     } else {
5582       const char *Sym = S->getSymbol();
5583       Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
5584     }
5585   }
5586
5587   // We don't usually want to end the call-sequence here because we would tidy
5588   // the frame up *after* the call, however in the ABI-changing tail-call case
5589   // we've carefully laid out the parameters so that when sp is reset they'll be
5590   // in the correct location.
5591   if (IsTailCall && !IsSibCall) {
5592     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
5593                                DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
5594     InFlag = Chain.getValue(1);
5595   }
5596
       // Operand order of the call node: chain, callee, [FPDiff for tail calls],
       // argument registers, register mask, [glue].
5597   std::vector<SDValue> Ops;
5598   Ops.push_back(Chain);
5599   Ops.push_back(Callee);
5600
5601   if (IsTailCall) {
5602     // Each tail call may have to adjust the stack by a different amount, so
5603     // this information must travel along with the operation for eventual
5604     // consumption by emitEpilogue.
5605     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
5606   }
5607
5608   // Add argument registers to the end of the list so that they are known live
5609   // into the call.
5610   for (auto &RegToPass : RegsToPass)
5611     Ops.push_back(DAG.getRegister(RegToPass.first,
5612                                   RegToPass.second.getValueType()));
5613
5614   // Add a register mask operand representing the call-preserved registers.
5615   const uint32_t *Mask;
5616   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
5617   if (IsThisReturn) {
5618     // For 'this' returns, use the X0-preserving mask if applicable
5619     Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
5620     if (!Mask) {
             // No such mask for this convention: fall back to the ordinary mask and
             // stop treating the call as a 'this' return.
5621       IsThisReturn = false;
5622       Mask = TRI->getCallPreservedMask(MF, CallConv);
5623     }
5624   } else
5625     Mask = TRI->getCallPreservedMask(MF, CallConv);
5626
5627   if (Subtarget->hasCustomCallingConv())
5628     TRI->UpdateCustomCallPreservedMask(MF, &Mask);
5629
5630   if (TRI->isAnyArgRegReserved(MF))
5631     TRI->emitReservedArgRegCallError(MF);
5632
5633   assert(Mask && "Missing call preserved mask for calling convention");
5634   Ops.push_back(DAG.getRegisterMask(Mask));
5635
5636   if (InFlag.getNode())
5637     Ops.push_back(InFlag);
5638
5639   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
5640
5641   // If we're doing a tail call, use a TC_RETURN here rather than an
5642   // actual call instruction.
5643   if (IsTailCall) {
5644     MF.getFrameInfo().setHasTailCall();
5645     SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
5646     DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
5647     return Ret;
5648   }
5649
5650   unsigned CallOpc = AArch64ISD::CALL;
5651   // Calls marked with "rv_marker" are special. They should be expanded to the
5652   // call, directly followed by a special marker sequence. Use the CALL_RVMARKER
5653   // to do that.
5654   if (CLI.CB && CLI.CB->hasRetAttr("rv_marker")) {
5655     assert(!IsTailCall && "tail calls cannot be marked with rv_marker");
5656     CallOpc = AArch64ISD::CALL_RVMARKER;
5657   }
5658
5659   // Returns a chain and a flag for retval copy to use.
5660   Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
5661   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
5662   InFlag = Chain.getValue(1);
5663   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
5664
       // The callee pops its 16-byte aligned argument area only when
       // DoesCalleeRestoreStack says so; otherwise nothing is popped by it.
5665   uint64_t CalleePopBytes =
5666       DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
5667
5668   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
5669                              DAG.getIntPtrConstant(CalleePopBytes, DL, true),
5670                              InFlag, DL);
5671   if (!Ins.empty())
5672     InFlag = Chain.getValue(1);
5673
5674   // Handle result values, copying them out of physregs into vregs that we
5675   // return.
5676   return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
5677                          InVals, IsThisReturn,
5678                          IsThisReturn ? OutVals[0] : SDValue());
5679 }
5680 
5681 bool AArch64TargetLowering::CanLowerReturn(
5682     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
5683     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
5684   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
5685   SmallVector<CCValAssign, 16> RVLocs;
5686   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
5687   return CCInfo.CheckReturn(Outs, RetCC);
5688 }
5689 
/// Lower an outgoing function return into an AArch64ISD::RET_FLAG node.
///
/// Each value in \p OutVals is assigned a return register by the calling
/// convention's return assignment function, converted as that assignment
/// requests (zero-extension, bitcast, shift into the upper half), and copied
/// into its physical register. The resulting node's operands are the updated
/// chain, one register operand per copied value, the sret register copy (if
/// the function recorded one), any callee-saved registers handled via copy,
/// and finally the glue value when one exists.
5690 SDValue
5691 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
5692                                    bool isVarArg,
5693                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
5694                                    const SmallVectorImpl<SDValue> &OutVals,
5695                                    const SDLoc &DL, SelectionDAG &DAG) const {
5696   auto &MF = DAG.getMachineFunction();
5697   auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5698 
5699   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
5700   SmallVector<CCValAssign, 16> RVLocs;
5701   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
5702                  *DAG.getContext());
5703   CCInfo.AnalyzeReturn(Outs, RetCC);
5704 
5705   // Copy the result values into the output registers.
  // RetVals collects one (register, value) pair per distinct return register;
  // RegsUsed tracks which registers already have an entry.
5706   SDValue Flag;
5707   SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
5708   SmallSet<unsigned, 4> RegsUsed;
  // Note: realRVLocIdx is incremented in lockstep with i, so both index the
  // same position.
5709   for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
5710        ++i, ++realRVLocIdx) {
5711     CCValAssign &VA = RVLocs[i];
5712     assert(VA.isRegLoc() && "Can only return in registers!");
5713     SDValue Arg = OutVals[realRVLocIdx];
5714 
    // Convert the value to its location type as the assignment requests.
5715     switch (VA.getLocInfo()) {
5716     default:
5717       llvm_unreachable("Unknown loc info!");
5718     case CCValAssign::Full:
5719       if (Outs[i].ArgVT == MVT::i1) {
5720         // AAPCS requires i1 to be zero-extended to i8 by the producer of the
5721         // value. This is strictly redundant on Darwin (which uses "zeroext
5722         // i1"), but will be optimised out before ISel.
5723         Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
5724         Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
5725       }
5726       break;
5727     case CCValAssign::BCvt:
5728       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
5729       break;
5730     case CCValAssign::AExt:
5731     case CCValAssign::ZExt:
5732       Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
5733       break;
5734     case CCValAssign::AExtUpper:
      // Widen the 32-bit value and shift it left by 32 so it occupies the
      // upper half of the location type.
5735       assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
5736       Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
5737       Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
5738                         DAG.getConstant(32, DL, VA.getLocVT()));
5739       break;
5740     }
5741 
    // When a second value lands in a register that already has an entry, OR
    // it into the existing value instead of adding another copy.
5742     if (RegsUsed.count(VA.getLocReg())) {
5743       SDValue &Bits =
5744           llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
5745             return Elt.first == VA.getLocReg();
5746           })->second;
5747       Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
5748     } else {
5749       RetVals.emplace_back(VA.getLocReg(), Arg);
5750       RegsUsed.insert(VA.getLocReg());
5751     }
5752   }
5753 
  // Emit the register copies, threading chain and glue from one copy to the
  // next, and record each register as an operand of the return node.
  // RetOps[0] is a placeholder for the chain and is overwritten further down.
5754   SmallVector<SDValue, 4> RetOps(1, Chain);
5755   for (auto &RetVal : RetVals) {
5756     Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
5757     Flag = Chain.getValue(1);
5758     RetOps.push_back(
5759         DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
5760   }
5761 
5762   // Windows AArch64 ABIs require that for returning structs by value we copy
5763   // the sret argument into X0 for the return.
5764   // We saved the argument into a virtual register in the entry block,
5765   // so now we copy the value out and into X0.
5766   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
5767     SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
5768                                      getPointerTy(MF.getDataLayout()));
5769 
5770     unsigned RetValReg = AArch64::X0;
5771     Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
5772     Flag = Chain.getValue(1);
5773 
5774     RetOps.push_back(
5775       DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
5776   }
5777 
  // Append every register returned by getCalleeSavedRegsViaCopy (a
  // zero-terminated list, or null when there is none) as an operand of the
  // return node.
5778   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
5779   const MCPhysReg *I =
5780       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
5781   if (I) {
5782     for (; *I; ++I) {
5783       if (AArch64::GPR64RegClass.contains(*I))
5784         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
5785       else if (AArch64::FPR64RegClass.contains(*I))
5786         RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
5787       else
5788         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
5789     }
5790   }
5791 
5792   RetOps[0] = Chain; // Update chain.
5793 
5794   // Add the flag if we have it.
5795   if (Flag.getNode())
5796     RetOps.push_back(Flag);
5797 
5798   return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
5799 }
5800 
5801 //===----------------------------------------------------------------------===//
5802 //  Other Lowering Code
5803 //===----------------------------------------------------------------------===//
5804 
5805 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
5806                                              SelectionDAG &DAG,
5807                                              unsigned Flag) const {
5808   return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
5809                                     N->getOffset(), Flag);
5810 }
5811 
5812 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
5813                                              SelectionDAG &DAG,
5814                                              unsigned Flag) const {
5815   return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
5816 }
5817 
5818 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
5819                                              SelectionDAG &DAG,
5820                                              unsigned Flag) const {
5821   return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
5822                                    N->getOffset(), Flag);
5823 }
5824 
5825 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
5826                                              SelectionDAG &DAG,
5827                                              unsigned Flag) const {
5828   return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
5829 }
5830 
5831 // (loadGOT sym)
5832 template <class NodeTy>
5833 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
5834                                       unsigned Flags) const {
5835   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
5836   SDLoc DL(N);
5837   EVT Ty = getPointerTy(DAG.getDataLayout());
5838   SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
5839   // FIXME: Once remat is capable of dealing with instructions with register
5840   // operands, expand this into two nodes instead of using a wrapper node.
5841   return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
5842 }
5843 
5844 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
5845 template <class NodeTy>
5846 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
5847                                             unsigned Flags) const {
5848   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
5849   SDLoc DL(N);
5850   EVT Ty = getPointerTy(DAG.getDataLayout());
5851   const unsigned char MO_NC = AArch64II::MO_NC;
5852   return DAG.getNode(
5853       AArch64ISD::WrapperLarge, DL, Ty,
5854       getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
5855       getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
5856       getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
5857       getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
5858 }
5859 
5860 // (addlow (adrp %hi(sym)) %lo(sym))
5861 template <class NodeTy>
5862 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
5863                                        unsigned Flags) const {
5864   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
5865   SDLoc DL(N);
5866   EVT Ty = getPointerTy(DAG.getDataLayout());
5867   SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
5868   SDValue Lo = getTargetNode(N, Ty, DAG,
5869                              AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
5870   SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
5871   return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
5872 }
5873 
5874 // (adr sym)
5875 template <class NodeTy>
5876 SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
5877                                            unsigned Flags) const {
5878   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
5879   SDLoc DL(N);
5880   EVT Ty = getPointerTy(DAG.getDataLayout());
5881   SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
5882   return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
5883 }
5884 
5885 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
5886                                                   SelectionDAG &DAG) const {
5887   GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
5888   const GlobalValue *GV = GN->getGlobal();
5889   unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
5890 
5891   if (OpFlags != AArch64II::MO_NO_FLAG)
5892     assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
5893            "unexpected offset in global node");
5894 
5895   // This also catches the large code model case for Darwin, and tiny code
5896   // model with got relocations.
5897   if ((OpFlags & AArch64II::MO_GOT) != 0) {
5898     return getGOT(GN, DAG, OpFlags);
5899   }
5900 
5901   SDValue Result;
5902   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
5903     Result = getAddrLarge(GN, DAG, OpFlags);
5904   } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
5905     Result = getAddrTiny(GN, DAG, OpFlags);
5906   } else {
5907     Result = getAddr(GN, DAG, OpFlags);
5908   }
5909   EVT PtrVT = getPointerTy(DAG.getDataLayout());
5910   SDLoc DL(GN);
5911   if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
5912     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
5913                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
5914   return Result;
5915 }
5916 
5917 /// Convert a TLS address reference into the correct sequence of loads
5918 /// and calls to compute the variable's address (for Darwin, currently) and
5919 /// return an SDValue containing the final node.
5920 ///
5921 /// Darwin only has one TLS scheme which must be capable of dealing with the
5922 /// fully general situation, in the worst case. This means:
5923 ///     + "extern __thread" declaration.
5924 ///     + Defined in a possibly unknown dynamic library.
5925 ///
5926 /// The general system is that each __thread variable has a [3 x i64] descriptor
5927 /// which contains information used by the runtime to calculate the address. The
5928 /// only part of this the compiler needs to know about is the first xword, which
5929 /// contains a function pointer that must be called with the address of the
5930 /// entire descriptor in "x0".
5931 ///
5932 /// Since this descriptor may be in a different unit, in general even the
5933 /// descriptor must be accessed via an indirect load. The "ideal" code sequence
5934 /// is:
5935 ///     adrp x0, _var@TLVPPAGE
5936 ///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
5937 ///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
5938 ///                                      ; the function pointer
5939 ///     blr x1                           ; Uses descriptor address in x0
5940 ///     ; Address of _var is now in x0.
5941 ///
5942 /// If the address of _var's descriptor *is* known to the linker, then it can
5943 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
5944 /// a slight efficiency gain.
5945 SDValue
5946 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
5947                                                    SelectionDAG &DAG) const {
5948   assert(Subtarget->isTargetDarwin() &&
5949          "This function expects a Darwin target");
5950 
5951   SDLoc DL(Op);
5952   MVT PtrVT = getPointerTy(DAG.getDataLayout());
5953   MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
5954   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
5955 
  // Address of the variable's descriptor: the MO_TLS-flagged symbol accessed
  // through a LOADgot node.
5956   SDValue TLVPAddr =
5957       DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
5958   SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
5959 
5960   // The first entry in the descriptor is a function pointer that we must call
5961   // to obtain the address of the variable.
5962   SDValue Chain = DAG.getEntryNode();
5963   SDValue FuncTLVGet = DAG.getLoad(
5964       PtrMemVT, DL, Chain, DescAddr,
5965       MachinePointerInfo::getGOT(DAG.getMachineFunction()),
5966       Align(PtrMemVT.getSizeInBits() / 8),
5967       MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
5968   Chain = FuncTLVGet.getValue(1);
5969 
5970   // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
5971   FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
5972 
  // NOTE(review): presumably needed because the call emitted below is built
  // by hand rather than through the normal call lowering -- confirm.
5973   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
5974   MFI.setAdjustsStack(true);
5975 
5976   // TLS calls preserve all registers except those that absolutely must be
5977   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
5978   // silly).
5979   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
5980   const uint32_t *Mask = TRI->getTLSCallPreservedMask();
5981   if (Subtarget->hasCustomCallingConv())
5982     TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
5983 
5984   // Finally, we can make the call. This is just a degenerate version of a
5985   // normal AArch64 call node: x0 takes the address of the descriptor, and
5986   // returns the address of the variable in this thread.
5987   Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
5988   Chain =
5989       DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
5990                   Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
5991                   DAG.getRegisterMask(Mask), Chain.getValue(1));
  // Read the result back from X0, glued to the call through its second result.
5992   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
5993 }
5994 
5995 /// Convert a thread-local variable reference into a sequence of instructions to
5996 /// compute the variable's address for the local exec TLS model of ELF targets.
5997 /// The sequence depends on the maximum TLS area size.
///
/// \p GV is the thread-local global being addressed and \p ThreadBase is the
/// value the variable's offset is added to. The TLSSize target option selects
/// the sequence; the handled values are 12, 24, 32 and 48, and any other
/// value is treated as unreachable. Returns the node computing the address.
5998 SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
5999                                                     SDValue ThreadBase,
6000                                                     const SDLoc &DL,
6001                                                     SelectionDAG &DAG) const {
6002   EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // Intermediate results shared by the multi-instruction cases below.
6003   SDValue TPOff, Addr;
6004 
6005   switch (DAG.getTarget().Options.TLSSize) {
6006   default:
6007     llvm_unreachable("Unexpected TLS size");
6008 
6009   case 12: {
6010     // mrs   x0, TPIDR_EL0
6011     // add   x0, x0, :tprel_lo12:a
6012     SDValue Var = DAG.getTargetGlobalAddress(
6013         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
6014     return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
6015                                       Var,
6016                                       DAG.getTargetConstant(0, DL, MVT::i32)),
6017                    0);
6018   }
6019 
6020   case 24: {
6021     // mrs   x0, TPIDR_EL0
6022     // add   x0, x0, :tprel_hi12:a
6023     // add   x0, x0, :tprel_lo12_nc:a
6024     SDValue HiVar = DAG.getTargetGlobalAddress(
6025         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
6026     SDValue LoVar = DAG.getTargetGlobalAddress(
6027         GV, DL, PtrVT, 0,
6028         AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
    // First add applies the high part to the thread base; the second add
    // applies the low part to that intermediate result.
6029     Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
6030                                       HiVar,
6031                                       DAG.getTargetConstant(0, DL, MVT::i32)),
6032                    0);
6033     return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
6034                                       LoVar,
6035                                       DAG.getTargetConstant(0, DL, MVT::i32)),
6036                    0);
6037   }
6038 
6039   case 32: {
6040     // mrs   x1, TPIDR_EL0
6041     // movz  x0, #:tprel_g1:a
6042     // movk  x0, #:tprel_g0_nc:a
6043     // add   x0, x1, x0
6044     SDValue HiVar = DAG.getTargetGlobalAddress(
6045         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
6046     SDValue LoVar = DAG.getTargetGlobalAddress(
6047         GV, DL, PtrVT, 0,
6048         AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
    // Build the offset with MOVZ + MOVK, then add it to the thread base.
6049     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
6050                                        DAG.getTargetConstant(16, DL, MVT::i32)),
6051                     0);
6052     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
6053                                        DAG.getTargetConstant(0, DL, MVT::i32)),
6054                     0);
6055     return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
6056   }
6057 
6058   case 48: {
6059     // mrs   x1, TPIDR_EL0
6060     // movz  x0, #:tprel_g2:a
6061     // movk  x0, #:tprel_g1_nc:a
6062     // movk  x0, #:tprel_g0_nc:a
6063     // add   x0, x1, x0
6064     SDValue HiVar = DAG.getTargetGlobalAddress(
6065         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
6066     SDValue MiVar = DAG.getTargetGlobalAddress(
6067         GV, DL, PtrVT, 0,
6068         AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
6069     SDValue LoVar = DAG.getTargetGlobalAddress(
6070         GV, DL, PtrVT, 0,
6071         AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
    // Build the offset with MOVZ + two MOVKs, then add it to the thread base.
6072     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
6073                                        DAG.getTargetConstant(32, DL, MVT::i32)),
6074                     0);
6075     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
6076                                        DAG.getTargetConstant(16, DL, MVT::i32)),
6077                     0);
6078     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
6079                                        DAG.getTargetConstant(0, DL, MVT::i32)),
6080                     0);
6081     return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
6082   }
6083   }
6084 }
6085 
6086 /// When accessing thread-local variables under either the general-dynamic or
6087 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will
6088 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
6089 /// is a function pointer to carry out the resolution.
6090 ///
6091 /// The sequence is:
6092 ///    adrp  x0, :tlsdesc:var
6093 ///    ldr   x1, [x0, #:tlsdesc_lo12:var]
6094 ///    add   x0, x0, #:tlsdesc_lo12:var
6095 ///    .tlsdesccall var
6096 ///    blr   x1
6097 ///    (TPIDR_EL0 offset now in x0)
6098 ///
6099 ///  The above sequence must be produced unscheduled, to enable the linker to
6100 ///  optimize/relax this sequence.
6101 ///  Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
6102 ///  above sequence, and expanded really late in the compilation flow, to ensure
6103 ///  the sequence is produced as per above.
6104 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
6105                                                       const SDLoc &DL,
6106                                                       SelectionDAG &DAG) const {
6107   EVT PtrVT = getPointerTy(DAG.getDataLayout());
6108 
6109   SDValue Chain = DAG.getEntryNode();
6110   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
6111 
6112   Chain =
6113       DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
6114   SDValue Glue = Chain.getValue(1);
6115 
6116   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
6117 }
6118 
/// Lower a thread-local global address on ELF targets according to the TLS
/// model chosen for the global.
///
/// Local-exec accesses are delegated to LowerELFTLSLocalExec. For the other
/// models an offset is computed -- a LOADgot for initial-exec, a TLS
/// descriptor call sequence for general-dynamic, and a descriptor call
/// against _TLS_MODULE_BASE_ followed by two ADDXri nodes for local-dynamic --
/// and the result is that offset added to the THREAD_POINTER node.
/// With the large code model, any model other than local-exec is a fatal
/// error.
6119 SDValue
6120 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
6121                                                 SelectionDAG &DAG) const {
6122   assert(Subtarget->isTargetELF() && "This function expects an ELF target");
6123 
6124   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
6125 
6126   TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
6127 
  // Unless local-dynamic generation has been enabled, handle local-dynamic
  // accesses as general-dynamic ones.
6128   if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
6129     if (Model == TLSModel::LocalDynamic)
6130       Model = TLSModel::GeneralDynamic;
6131   }
6132 
6133   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
6134       Model != TLSModel::LocalExec)
6135     report_fatal_error("ELF TLS only supported in small memory model or "
6136                        "in local exec TLS model");
6137   // Different choices can be made for the maximum size of the TLS area for a
6138   // module. For the small address model, the default TLS size is 16MiB and the
6139   // maximum TLS size is 4GiB.
6140   // FIXME: add tiny and large code model support for TLS access models other
6141   // than local exec. We currently generate the same code as small for tiny,
6142   // which may be larger than needed.
6143 
6144   SDValue TPOff;
6145   EVT PtrVT = getPointerTy(DAG.getDataLayout());
6146   SDLoc DL(Op);
6147   const GlobalValue *GV = GA->getGlobal();
6148 
6149   SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
6150 
6151   if (Model == TLSModel::LocalExec) {
    // Local exec produces the complete address itself.
6152     return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
6153   } else if (Model == TLSModel::InitialExec) {
    // The offset is obtained through a LOADgot of the MO_TLS-flagged symbol.
6154     TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
6155     TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
6156   } else if (Model == TLSModel::LocalDynamic) {
6157     // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
6158     // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
6159     // the beginning of the module's TLS region, followed by a DTPREL offset
6160     // calculation.
6161 
6162     // These accesses will need deduplicating if there's more than one.
6163     AArch64FunctionInfo *MFI =
6164         DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
6165     MFI->incNumLocalDynamicTLSAccesses();
6166 
6167     // The call needs a relocation too for linker relaxation. It doesn't make
6168     // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
6169     // the address.
6170     SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
6171                                                   AArch64II::MO_TLS);
6172 
6173     // Now we can calculate the offset from TPIDR_EL0 to this module's
6174     // thread-local area.
6175     TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
6176 
6177     // Now use :dtprel_whatever: operations to calculate this variable's offset
6178     // in its thread-storage area.
6179     SDValue HiVar = DAG.getTargetGlobalAddress(
6180         GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
6181     SDValue LoVar = DAG.getTargetGlobalAddress(
6182         GV, DL, MVT::i64, 0,
6183         AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
6184 
6185     TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
6186                                        DAG.getTargetConstant(0, DL, MVT::i32)),
6187                     0);
6188     TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
6189                                        DAG.getTargetConstant(0, DL, MVT::i32)),
6190                     0);
6191   } else if (Model == TLSModel::GeneralDynamic) {
6192     // The call needs a relocation too for linker relaxation. It doesn't make
6193     // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
6194     // the address.
6195     SDValue SymAddr =
6196         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
6197 
6198     // Finally we can make a call to calculate the offset from tpidr_el0.
6199     TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
6200   } else
6201     llvm_unreachable("Unsupported ELF TLS access model");
6202 
  // All remaining models: address = thread pointer + computed offset.
6203   return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
6204 }
6205 
/// Lower a thread-local global address on Windows targets.
///
/// The emitted nodes load the TLS array pointer from offset 0x58 of the TEB
/// (read from X18), load the 32-bit _tls_index value, use that index scaled
/// by 8 to load this module's TLS block pointer from the array, and finally
/// add the variable's offset within the .tls section. The loads are chained
/// one after another starting from the entry node.
6206 SDValue
6207 AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
6208                                                     SelectionDAG &DAG) const {
6209   assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
6210 
6211   SDValue Chain = DAG.getEntryNode();
6212   EVT PtrVT = getPointerTy(DAG.getDataLayout());
6213   SDLoc DL(Op);
6214 
6215   SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
6216 
6217   // Load the ThreadLocalStoragePointer from the TEB
6218   // A pointer to the TLS array is located at offset 0x58 from the TEB.
6219   SDValue TLSArray =
6220       DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
6221   TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
6222   Chain = TLSArray.getValue(1);
6223 
6224   // Load the TLS index from the C runtime;
6225   // This does the same as getAddr(), but without having a GlobalAddressSDNode.
6226   // This also does the same as LOADgot, but using a generic i32 load,
6227   // while LOADgot only loads i64.
6228   SDValue TLSIndexHi =
6229       DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
6230   SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
6231       "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
6232   SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
6233   SDValue TLSIndex =
6234       DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
6235   TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
6236   Chain = TLSIndex.getValue(1);
6237 
6238   // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
6239   // offset into the TLSArray.
6240   TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
6241   SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
6242                              DAG.getConstant(3, DL, PtrVT));
6243   SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
6244                             DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
6245                             MachinePointerInfo());
6246   Chain = TLS.getValue(1);
6247 
  // Two symbol operands for the variable: one flagged MO_HI12 and one
  // flagged MO_PAGEOFF | MO_NC.
6248   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
6249   const GlobalValue *GV = GA->getGlobal();
6250   SDValue TGAHi = DAG.getTargetGlobalAddress(
6251       GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
6252   SDValue TGALo = DAG.getTargetGlobalAddress(
6253       GV, DL, PtrVT, 0,
6254       AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
6255 
6256   // Add the offset from the start of the .tls section (section base).
6257   SDValue Addr =
6258       SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
6259                                  DAG.getTargetConstant(0, DL, MVT::i32)),
6260               0);
6261   Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
6262   return Addr;
6263 }
6264 
6265 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
6266                                                      SelectionDAG &DAG) const {
6267   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
6268   if (DAG.getTarget().useEmulatedTLS())
6269     return LowerToTLSEmulatedModel(GA, DAG);
6270 
6271   if (Subtarget->isTargetDarwin())
6272     return LowerDarwinGlobalTLSAddress(Op, DAG);
6273   if (Subtarget->isTargetELF())
6274     return LowerELFGlobalTLSAddress(Op, DAG);
6275   if (Subtarget->isTargetWindows())
6276     return LowerWindowsGlobalTLSAddress(Op, DAG);
6277 
6278   llvm_unreachable("Unexpected platform trying to use TLS");
6279 }
6280 
6281 // Looks through \param Val to determine the bit that can be used to
6282 // check the sign of the value. It returns the unextended value and
6283 // the sign bit position.
6284 std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
6285   if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
6286     return {Val.getOperand(0),
6287             cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
6288                 1};
6289 
6290   if (Val.getOpcode() == ISD::SIGN_EXTEND)
6291     return {Val.getOperand(0),
6292             Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
6293 
6294   return {Val, Val.getValueSizeInBits() - 1};
6295 }
6296 
6297 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
6298   SDValue Chain = Op.getOperand(0);
6299   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
6300   SDValue LHS = Op.getOperand(2);
6301   SDValue RHS = Op.getOperand(3);
6302   SDValue Dest = Op.getOperand(4);
6303   SDLoc dl(Op);
6304 
6305   MachineFunction &MF = DAG.getMachineFunction();
6306   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
6307   // will not be produced, as they are conditional branch instructions that do
6308   // not set flags.
6309   bool ProduceNonFlagSettingCondBr =
6310       !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
6311 
6312   // Handle f128 first, since lowering it will result in comparing the return
6313   // value of a libcall against zero, which is just what the rest of LowerBR_CC
6314   // is expecting to deal with.
6315   if (LHS.getValueType() == MVT::f128) {
6316     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
6317 
6318     // If softenSetCCOperands returned a scalar, we need to compare the result
6319     // against zero to select between true and false values.
6320     if (!RHS.getNode()) {
6321       RHS = DAG.getConstant(0, dl, LHS.getValueType());
6322       CC = ISD::SETNE;
6323     }
6324   }
6325 
6326   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
6327   // instruction.
6328   if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
6329       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6330     // Only lower legal XALUO ops.
6331     if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
6332       return SDValue();
6333 
6334     // The actual operation with overflow check.
6335     AArch64CC::CondCode OFCC;
6336     SDValue Value, Overflow;
6337     std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
6338 
6339     if (CC == ISD::SETNE)
6340       OFCC = getInvertedCondCode(OFCC);
6341     SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
6342 
6343     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6344                        Overflow);
6345   }
6346 
6347   if (LHS.getValueType().isInteger()) {
6348     assert((LHS.getValueType() == RHS.getValueType()) &&
6349            (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
6350 
6351     // If the RHS of the comparison is zero, we can potentially fold this
6352     // to a specialized branch.
6353     const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
6354     if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
6355       if (CC == ISD::SETEQ) {
6356         // See if we can use a TBZ to fold in an AND as well.
6357         // TBZ has a smaller branch displacement than CBZ.  If the offset is
6358         // out of bounds, a late MI-layer pass rewrites branches.
6359         // 403.gcc is an example that hits this case.
6360         if (LHS.getOpcode() == ISD::AND &&
6361             isa<ConstantSDNode>(LHS.getOperand(1)) &&
6362             isPowerOf2_64(LHS.getConstantOperandVal(1))) {
6363           SDValue Test = LHS.getOperand(0);
6364           uint64_t Mask = LHS.getConstantOperandVal(1);
6365           return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
6366                              DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
6367                              Dest);
6368         }
6369 
6370         return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
6371       } else if (CC == ISD::SETNE) {
6372         // See if we can use a TBZ to fold in an AND as well.
6373         // TBZ has a smaller branch displacement than CBZ.  If the offset is
6374         // out of bounds, a late MI-layer pass rewrites branches.
6375         // 403.gcc is an example that hits this case.
6376         if (LHS.getOpcode() == ISD::AND &&
6377             isa<ConstantSDNode>(LHS.getOperand(1)) &&
6378             isPowerOf2_64(LHS.getConstantOperandVal(1))) {
6379           SDValue Test = LHS.getOperand(0);
6380           uint64_t Mask = LHS.getConstantOperandVal(1);
6381           return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
6382                              DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
6383                              Dest);
6384         }
6385 
6386         return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
6387       } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
6388         // Don't combine AND since emitComparison converts the AND to an ANDS
6389         // (a.k.a. TST) and the test in the test bit and branch instruction
6390         // becomes redundant.  This would also increase register pressure.
6391         uint64_t SignBitPos;
6392         std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
6393         return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
6394                            DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
6395       }
6396     }
6397     if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
6398         LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
6399       // Don't combine AND since emitComparison converts the AND to an ANDS
6400       // (a.k.a. TST) and the test in the test bit and branch instruction
6401       // becomes redundant.  This would also increase register pressure.
6402       uint64_t SignBitPos;
6403       std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
6404       return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
6405                          DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
6406     }
6407 
6408     SDValue CCVal;
6409     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
6410     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6411                        Cmp);
6412   }
6413 
6414   assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
6415          LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
6416 
6417   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
6418   // clean.  Some of them require two branches to implement.
6419   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
6420   AArch64CC::CondCode CC1, CC2;
6421   changeFPCCToAArch64CC(CC, CC1, CC2);
6422   SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
6423   SDValue BR1 =
6424       DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
6425   if (CC2 != AArch64CC::AL) {
6426     SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
6427     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
6428                        Cmp);
6429   }
6430 
6431   return BR1;
6432 }
6433 
6434 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
6435                                               SelectionDAG &DAG) const {
6436   EVT VT = Op.getValueType();
6437   SDLoc DL(Op);
6438 
6439   SDValue In1 = Op.getOperand(0);
6440   SDValue In2 = Op.getOperand(1);
6441   EVT SrcVT = In2.getValueType();
6442 
6443   if (SrcVT.bitsLT(VT))
6444     In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
6445   else if (SrcVT.bitsGT(VT))
6446     In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
6447 
6448   EVT VecVT;
6449   uint64_t EltMask;
6450   SDValue VecVal1, VecVal2;
6451 
6452   auto setVecVal = [&] (int Idx) {
6453     if (!VT.isVector()) {
6454       VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
6455                                           DAG.getUNDEF(VecVT), In1);
6456       VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
6457                                           DAG.getUNDEF(VecVT), In2);
6458     } else {
6459       VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
6460       VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
6461     }
6462   };
6463 
6464   if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
6465     VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
6466     EltMask = 0x80000000ULL;
6467     setVecVal(AArch64::ssub);
6468   } else if (VT == MVT::f64 || VT == MVT::v2f64) {
6469     VecVT = MVT::v2i64;
6470 
6471     // We want to materialize a mask with the high bit set, but the AdvSIMD
6472     // immediate moves cannot materialize that in a single instruction for
6473     // 64-bit elements. Instead, materialize zero and then negate it.
6474     EltMask = 0;
6475 
6476     setVecVal(AArch64::dsub);
6477   } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) {
6478     VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
6479     EltMask = 0x8000ULL;
6480     setVecVal(AArch64::hsub);
6481   } else {
6482     llvm_unreachable("Invalid type for copysign!");
6483   }
6484 
6485   SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);
6486 
6487   // If we couldn't materialize the mask above, then the mask vector will be
6488   // the zero vector, and we need to negate it here.
6489   if (VT == MVT::f64 || VT == MVT::v2f64) {
6490     BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
6491     BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
6492     BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
6493   }
6494 
6495   SDValue Sel =
6496       DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
6497 
6498   if (VT == MVT::f16)
6499     return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel);
6500   if (VT == MVT::f32)
6501     return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
6502   else if (VT == MVT::f64)
6503     return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
6504   else
6505     return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
6506 }
6507 
6508 SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
6509   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
6510           Attribute::NoImplicitFloat))
6511     return SDValue();
6512 
6513   if (!Subtarget->hasNEON())
6514     return SDValue();
6515 
6516   // While there is no integer popcount instruction, it can
6517   // be more efficiently lowered to the following sequence that uses
6518   // AdvSIMD registers/instructions as long as the copies to/from
6519   // the AdvSIMD registers are cheap.
6520   //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
6521   //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
6522   //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
6523   //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
6524   SDValue Val = Op.getOperand(0);
6525   SDLoc DL(Op);
6526   EVT VT = Op.getValueType();
6527 
6528   if (VT == MVT::i32 || VT == MVT::i64) {
6529     if (VT == MVT::i32)
6530       Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
6531     Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
6532 
6533     SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
6534     SDValue UaddLV = DAG.getNode(
6535         ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
6536         DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
6537 
6538     if (VT == MVT::i64)
6539       UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
6540     return UaddLV;
6541   } else if (VT == MVT::i128) {
6542     Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
6543 
6544     SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
6545     SDValue UaddLV = DAG.getNode(
6546         ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
6547         DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
6548 
6549     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
6550   }
6551 
6552   if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
6553     return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
6554 
6555   assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6556           VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6557          "Unexpected type for custom ctpop lowering");
6558 
6559   EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6560   Val = DAG.getBitcast(VT8Bit, Val);
6561   Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
6562 
6563   // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
6564   unsigned EltSize = 8;
6565   unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6566   while (EltSize != VT.getScalarSizeInBits()) {
6567     EltSize *= 2;
6568     NumElts /= 2;
6569     MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6570     Val = DAG.getNode(
6571         ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
6572         DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
6573   }
6574 
6575   return Val;
6576 }
6577 
6578 SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
6579   EVT VT = Op.getValueType();
6580   assert(VT.isScalableVector() ||
6581          useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true));
6582 
6583   SDLoc DL(Op);
6584   SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
6585   return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
6586 }
6587 
6588 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
6589 
6590   if (Op.getValueType().isVector())
6591     return LowerVSETCC(Op, DAG);
6592 
6593   bool IsStrict = Op->isStrictFPOpcode();
6594   bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
6595   unsigned OpNo = IsStrict ? 1 : 0;
6596   SDValue Chain;
6597   if (IsStrict)
6598     Chain = Op.getOperand(0);
6599   SDValue LHS = Op.getOperand(OpNo + 0);
6600   SDValue RHS = Op.getOperand(OpNo + 1);
6601   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
6602   SDLoc dl(Op);
6603 
6604   // We chose ZeroOrOneBooleanContents, so use zero and one.
6605   EVT VT = Op.getValueType();
6606   SDValue TVal = DAG.getConstant(1, dl, VT);
6607   SDValue FVal = DAG.getConstant(0, dl, VT);
6608 
6609   // Handle f128 first, since one possible outcome is a normal integer
6610   // comparison which gets picked up by the next if statement.
6611   if (LHS.getValueType() == MVT::f128) {
6612     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
6613                         IsSignaling);
6614 
6615     // If softenSetCCOperands returned a scalar, use it.
6616     if (!RHS.getNode()) {
6617       assert(LHS.getValueType() == Op.getValueType() &&
6618              "Unexpected setcc expansion!");
6619       return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
6620     }
6621   }
6622 
6623   if (LHS.getValueType().isInteger()) {
6624     SDValue CCVal;
6625     SDValue Cmp = getAArch64Cmp(
6626         LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
6627 
6628     // Note that we inverted the condition above, so we reverse the order of
6629     // the true and false operands here.  This will allow the setcc to be
6630     // matched to a single CSINC instruction.
6631     SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
6632     return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
6633   }
6634 
6635   // Now we know we're dealing with FP values.
6636   assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
6637          LHS.getValueType() == MVT::f64);
6638 
6639   // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
6640   // and do the comparison.
6641   SDValue Cmp;
6642   if (IsStrict)
6643     Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
6644   else
6645     Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
6646 
6647   AArch64CC::CondCode CC1, CC2;
6648   changeFPCCToAArch64CC(CC, CC1, CC2);
6649   SDValue Res;
6650   if (CC2 == AArch64CC::AL) {
6651     changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
6652                           CC2);
6653     SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
6654 
6655     // Note that we inverted the condition above, so we reverse the order of
6656     // the true and false operands here.  This will allow the setcc to be
6657     // matched to a single CSINC instruction.
6658     Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
6659   } else {
6660     // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
6661     // totally clean.  Some of them require two CSELs to implement.  As is in
6662     // this case, we emit the first CSEL and then emit a second using the output
6663     // of the first as the RHS.  We're effectively OR'ing the two CC's together.
6664 
6665     // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
6666     SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
6667     SDValue CS1 =
6668         DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
6669 
6670     SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
6671     Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
6672   }
6673   return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
6674 }
6675 
6676 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
6677                                               SDValue RHS, SDValue TVal,
6678                                               SDValue FVal, const SDLoc &dl,
6679                                               SelectionDAG &DAG) const {
6680   // Handle f128 first, because it will result in a comparison of some RTLIB
6681   // call result against zero.
6682   if (LHS.getValueType() == MVT::f128) {
6683     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
6684 
6685     // If softenSetCCOperands returned a scalar, we need to compare the result
6686     // against zero to select between true and false values.
6687     if (!RHS.getNode()) {
6688       RHS = DAG.getConstant(0, dl, LHS.getValueType());
6689       CC = ISD::SETNE;
6690     }
6691   }
6692 
6693   // Also handle f16, for which we need to do a f32 comparison.
6694   if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
6695     LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
6696     RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
6697   }
6698 
6699   // Next, handle integers.
6700   if (LHS.getValueType().isInteger()) {
6701     assert((LHS.getValueType() == RHS.getValueType()) &&
6702            (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
6703 
6704     unsigned Opcode = AArch64ISD::CSEL;
6705 
6706     // If both the TVal and the FVal are constants, see if we can swap them in
6707     // order to for a CSINV or CSINC out of them.
6708     ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
6709     ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
6710 
6711     if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
6712       std::swap(TVal, FVal);
6713       std::swap(CTVal, CFVal);
6714       CC = ISD::getSetCCInverse(CC, LHS.getValueType());
6715     } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
6716       std::swap(TVal, FVal);
6717       std::swap(CTVal, CFVal);
6718       CC = ISD::getSetCCInverse(CC, LHS.getValueType());
6719     } else if (TVal.getOpcode() == ISD::XOR) {
6720       // If TVal is a NOT we want to swap TVal and FVal so that we can match
6721       // with a CSINV rather than a CSEL.
6722       if (isAllOnesConstant(TVal.getOperand(1))) {
6723         std::swap(TVal, FVal);
6724         std::swap(CTVal, CFVal);
6725         CC = ISD::getSetCCInverse(CC, LHS.getValueType());
6726       }
6727     } else if (TVal.getOpcode() == ISD::SUB) {
6728       // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
6729       // that we can match with a CSNEG rather than a CSEL.
6730       if (isNullConstant(TVal.getOperand(0))) {
6731         std::swap(TVal, FVal);
6732         std::swap(CTVal, CFVal);
6733         CC = ISD::getSetCCInverse(CC, LHS.getValueType());
6734       }
6735     } else if (CTVal && CFVal) {
6736       const int64_t TrueVal = CTVal->getSExtValue();
6737       const int64_t FalseVal = CFVal->getSExtValue();
6738       bool Swap = false;
6739 
6740       // If both TVal and FVal are constants, see if FVal is the
6741       // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
6742       // instead of a CSEL in that case.
6743       if (TrueVal == ~FalseVal) {
6744         Opcode = AArch64ISD::CSINV;
6745       } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
6746                  TrueVal == -FalseVal) {
6747         Opcode = AArch64ISD::CSNEG;
6748       } else if (TVal.getValueType() == MVT::i32) {
6749         // If our operands are only 32-bit wide, make sure we use 32-bit
6750         // arithmetic for the check whether we can use CSINC. This ensures that
6751         // the addition in the check will wrap around properly in case there is
6752         // an overflow (which would not be the case if we do the check with
6753         // 64-bit arithmetic).
6754         const uint32_t TrueVal32 = CTVal->getZExtValue();
6755         const uint32_t FalseVal32 = CFVal->getZExtValue();
6756 
6757         if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
6758           Opcode = AArch64ISD::CSINC;
6759 
6760           if (TrueVal32 > FalseVal32) {
6761             Swap = true;
6762           }
6763         }
6764         // 64-bit check whether we can use CSINC.
6765       } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
6766         Opcode = AArch64ISD::CSINC;
6767 
6768         if (TrueVal > FalseVal) {
6769           Swap = true;
6770         }
6771       }
6772 
6773       // Swap TVal and FVal if necessary.
6774       if (Swap) {
6775         std::swap(TVal, FVal);
6776         std::swap(CTVal, CFVal);
6777         CC = ISD::getSetCCInverse(CC, LHS.getValueType());
6778       }
6779 
6780       if (Opcode != AArch64ISD::CSEL) {
6781         // Drop FVal since we can get its value by simply inverting/negating
6782         // TVal.
6783         FVal = TVal;
6784       }
6785     }
6786 
6787     // Avoid materializing a constant when possible by reusing a known value in
6788     // a register.  However, don't perform this optimization if the known value
6789     // is one, zero or negative one in the case of a CSEL.  We can always
6790     // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
6791     // FVal, respectively.
6792     ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
6793     if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
6794         !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
6795       AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
6796       // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
6797       // "a != C ? x : a" to avoid materializing C.
6798       if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
6799         TVal = LHS;
6800       else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
6801         FVal = LHS;
6802     } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
6803       assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
6804       // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
6805       // avoid materializing C.
6806       AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
6807       if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
6808         Opcode = AArch64ISD::CSINV;
6809         TVal = LHS;
6810         FVal = DAG.getConstant(0, dl, FVal.getValueType());
6811       }
6812     }
6813 
6814     SDValue CCVal;
6815     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
6816     EVT VT = TVal.getValueType();
6817     return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
6818   }
6819 
6820   // Now we know we're dealing with FP values.
6821   assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
6822          LHS.getValueType() == MVT::f64);
6823   assert(LHS.getValueType() == RHS.getValueType());
6824   EVT VT = TVal.getValueType();
6825   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
6826 
6827   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
6828   // clean.  Some of them require two CSELs to implement.
6829   AArch64CC::CondCode CC1, CC2;
6830   changeFPCCToAArch64CC(CC, CC1, CC2);
6831 
6832   if (DAG.getTarget().Options.UnsafeFPMath) {
6833     // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
6834     // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
6835     ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
6836     if (RHSVal && RHSVal->isZero()) {
6837       ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
6838       ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
6839 
6840       if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
6841           CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
6842         TVal = LHS;
6843       else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
6844                CFVal && CFVal->isZero() &&
6845                FVal.getValueType() == LHS.getValueType())
6846         FVal = LHS;
6847     }
6848   }
6849 
6850   // Emit first, and possibly only, CSEL.
6851   SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
6852   SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
6853 
6854   // If we need a second CSEL, emit it, using the output of the first as the
6855   // RHS.  We're effectively OR'ing the two CC's together.
6856   if (CC2 != AArch64CC::AL) {
6857     SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
6858     return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
6859   }
6860 
6861   // Otherwise, return the output of the first CSEL.
6862   return CS1;
6863 }
6864 
6865 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
6866                                               SelectionDAG &DAG) const {
6867   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
6868   SDValue LHS = Op.getOperand(0);
6869   SDValue RHS = Op.getOperand(1);
6870   SDValue TVal = Op.getOperand(2);
6871   SDValue FVal = Op.getOperand(3);
6872   SDLoc DL(Op);
6873   return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
6874 }
6875 
6876 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
6877                                            SelectionDAG &DAG) const {
6878   SDValue CCVal = Op->getOperand(0);
6879   SDValue TVal = Op->getOperand(1);
6880   SDValue FVal = Op->getOperand(2);
6881   SDLoc DL(Op);
6882 
6883   EVT Ty = Op.getValueType();
6884   if (Ty.isScalableVector()) {
6885     SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
6886     MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
6887     SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
6888     return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
6889   }
6890 
6891   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
6892   // instruction.
6893   if (ISD::isOverflowIntrOpRes(CCVal)) {
6894     // Only lower legal XALUO ops.
6895     if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
6896       return SDValue();
6897 
6898     AArch64CC::CondCode OFCC;
6899     SDValue Value, Overflow;
6900     std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
6901     SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
6902 
6903     return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
6904                        CCVal, Overflow);
6905   }
6906 
6907   // Lower it the same way as we would lower a SELECT_CC node.
6908   ISD::CondCode CC;
6909   SDValue LHS, RHS;
6910   if (CCVal.getOpcode() == ISD::SETCC) {
6911     LHS = CCVal.getOperand(0);
6912     RHS = CCVal.getOperand(1);
6913     CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get();
6914   } else {
6915     LHS = CCVal;
6916     RHS = DAG.getConstant(0, DL, CCVal.getValueType());
6917     CC = ISD::SETNE;
6918   }
6919   return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
6920 }
6921 
6922 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
6923                                               SelectionDAG &DAG) const {
6924   // Jump table entries as PC relative offsets. No additional tweaking
6925   // is necessary here. Just get the address of the jump table.
6926   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
6927 
6928   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
6929       !Subtarget->isTargetMachO()) {
6930     return getAddrLarge(JT, DAG);
6931   } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
6932     return getAddrTiny(JT, DAG);
6933   }
6934   return getAddr(JT, DAG);
6935 }
6936 
6937 SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
6938                                           SelectionDAG &DAG) const {
6939   // Jump table entries as PC relative offsets. No additional tweaking
6940   // is necessary here. Just get the address of the jump table.
6941   SDLoc DL(Op);
6942   SDValue JT = Op.getOperand(1);
6943   SDValue Entry = Op.getOperand(2);
6944   int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
6945 
6946   auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
6947   AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
6948 
6949   SDNode *Dest =
6950       DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
6951                          Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
6952   return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
6953                      SDValue(Dest, 0));
6954 }
6955 
6956 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
6957                                                  SelectionDAG &DAG) const {
6958   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
6959 
6960   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
6961     // Use the GOT for the large code model on iOS.
6962     if (Subtarget->isTargetMachO()) {
6963       return getGOT(CP, DAG);
6964     }
6965     return getAddrLarge(CP, DAG);
6966   } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
6967     return getAddrTiny(CP, DAG);
6968   } else {
6969     return getAddr(CP, DAG);
6970   }
6971 }
6972 
6973 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
6974                                                SelectionDAG &DAG) const {
6975   BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
6976   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
6977       !Subtarget->isTargetMachO()) {
6978     return getAddrLarge(BA, DAG);
6979   } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
6980     return getAddrTiny(BA, DAG);
6981   }
6982   return getAddr(BA, DAG);
6983 }
6984 
6985 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
6986                                                  SelectionDAG &DAG) const {
6987   AArch64FunctionInfo *FuncInfo =
6988       DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
6989 
6990   SDLoc DL(Op);
6991   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
6992                                  getPointerTy(DAG.getDataLayout()));
6993   FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
6994   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6995   return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
6996                       MachinePointerInfo(SV));
6997 }
6998 
6999 SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
7000                                                   SelectionDAG &DAG) const {
7001   AArch64FunctionInfo *FuncInfo =
7002       DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
7003 
7004   SDLoc DL(Op);
7005   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
7006                                      ? FuncInfo->getVarArgsGPRIndex()
7007                                      : FuncInfo->getVarArgsStackIndex(),
7008                                  getPointerTy(DAG.getDataLayout()));
7009   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7010   return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
7011                       MachinePointerInfo(SV));
7012 }
7013 
7014 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
7015                                                   SelectionDAG &DAG) const {
7016   // The layout of the va_list struct is specified in the AArch64 Procedure Call
7017   // Standard, section B.3.
7018   MachineFunction &MF = DAG.getMachineFunction();
7019   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7020   unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
7021   auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
7022   auto PtrVT = getPointerTy(DAG.getDataLayout());
7023   SDLoc DL(Op);
7024 
7025   SDValue Chain = Op.getOperand(0);
7026   SDValue VAList = Op.getOperand(1);
7027   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7028   SmallVector<SDValue, 4> MemOps;
7029 
7030   // void *__stack at offset 0
7031   unsigned Offset = 0;
7032   SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
7033   Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
7034   MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
7035                                 MachinePointerInfo(SV), Align(PtrSize)));
7036 
7037   // void *__gr_top at offset 8 (4 on ILP32)
7038   Offset += PtrSize;
7039   int GPRSize = FuncInfo->getVarArgsGPRSize();
7040   if (GPRSize > 0) {
7041     SDValue GRTop, GRTopAddr;
7042 
7043     GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7044                             DAG.getConstant(Offset, DL, PtrVT));
7045 
7046     GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
7047     GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
7048                         DAG.getConstant(GPRSize, DL, PtrVT));
7049     GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
7050 
7051     MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
7052                                   MachinePointerInfo(SV, Offset),
7053                                   Align(PtrSize)));
7054   }
7055 
7056   // void *__vr_top at offset 16 (8 on ILP32)
7057   Offset += PtrSize;
7058   int FPRSize = FuncInfo->getVarArgsFPRSize();
7059   if (FPRSize > 0) {
7060     SDValue VRTop, VRTopAddr;
7061     VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7062                             DAG.getConstant(Offset, DL, PtrVT));
7063 
7064     VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
7065     VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
7066                         DAG.getConstant(FPRSize, DL, PtrVT));
7067     VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
7068 
7069     MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
7070                                   MachinePointerInfo(SV, Offset),
7071                                   Align(PtrSize)));
7072   }
7073 
7074   // int __gr_offs at offset 24 (12 on ILP32)
7075   Offset += PtrSize;
7076   SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7077                                    DAG.getConstant(Offset, DL, PtrVT));
7078   MemOps.push_back(
7079       DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
7080                    GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
7081 
7082   // int __vr_offs at offset 28 (16 on ILP32)
7083   Offset += 4;
7084   SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7085                                    DAG.getConstant(Offset, DL, PtrVT));
7086   MemOps.push_back(
7087       DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
7088                    VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
7089 
7090   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7091 }
7092 
7093 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
7094                                             SelectionDAG &DAG) const {
7095   MachineFunction &MF = DAG.getMachineFunction();
7096 
7097   if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
7098     return LowerWin64_VASTART(Op, DAG);
7099   else if (Subtarget->isTargetDarwin())
7100     return LowerDarwin_VASTART(Op, DAG);
7101   else
7102     return LowerAAPCS_VASTART(Op, DAG);
7103 }
7104 
7105 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
7106                                            SelectionDAG &DAG) const {
7107   // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
7108   // pointer.
7109   SDLoc DL(Op);
7110   unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
7111   unsigned VaListSize =
7112       (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
7113           ? PtrSize
7114           : Subtarget->isTargetILP32() ? 20 : 32;
7115   const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
7116   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7117 
7118   return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
7119                        DAG.getConstant(VaListSize, DL, MVT::i32),
7120                        Align(PtrSize), false, false, false,
7121                        MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
7122 }
7123 
7124 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
7125   assert(Subtarget->isTargetDarwin() &&
7126          "automatic va_arg instruction only works on Darwin");
7127 
7128   const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
7129   EVT VT = Op.getValueType();
7130   SDLoc DL(Op);
7131   SDValue Chain = Op.getOperand(0);
7132   SDValue Addr = Op.getOperand(1);
7133   MaybeAlign Align(Op.getConstantOperandVal(3));
7134   unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
7135   auto PtrVT = getPointerTy(DAG.getDataLayout());
7136   auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
7137   SDValue VAList =
7138       DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
7139   Chain = VAList.getValue(1);
7140   VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
7141 
7142   if (VT.isScalableVector())
7143     report_fatal_error("Passing SVE types to variadic functions is "
7144                        "currently not supported");
7145 
7146   if (Align && *Align > MinSlotSize) {
7147     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7148                          DAG.getConstant(Align->value() - 1, DL, PtrVT));
7149     VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
7150                          DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
7151   }
7152 
7153   Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
7154   unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
7155 
7156   // Scalar integer and FP values smaller than 64 bits are implicitly extended
7157   // up to 64 bits.  At the very least, we have to increase the striding of the
7158   // vaargs list to match this, and for FP values we need to introduce
7159   // FP_ROUND nodes as well.
7160   if (VT.isInteger() && !VT.isVector())
7161     ArgSize = std::max(ArgSize, MinSlotSize);
7162   bool NeedFPTrunc = false;
7163   if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
7164     ArgSize = 8;
7165     NeedFPTrunc = true;
7166   }
7167 
7168   // Increment the pointer, VAList, to the next vaarg
7169   SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
7170                                DAG.getConstant(ArgSize, DL, PtrVT));
7171   VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
7172 
7173   // Store the incremented VAList to the legalized pointer
7174   SDValue APStore =
7175       DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
7176 
7177   // Load the actual argument out of the pointer VAList
7178   if (NeedFPTrunc) {
7179     // Load the value as an f64.
7180     SDValue WideFP =
7181         DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
7182     // Round the value down to an f32.
7183     SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
7184                                    DAG.getIntPtrConstant(1, DL));
7185     SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
7186     // Merge the rounded value with the chain output of the load.
7187     return DAG.getMergeValues(Ops, DL);
7188   }
7189 
7190   return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
7191 }
7192 
7193 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
7194                                               SelectionDAG &DAG) const {
7195   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7196   MFI.setFrameAddressIsTaken(true);
7197 
7198   EVT VT = Op.getValueType();
7199   SDLoc DL(Op);
7200   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7201   SDValue FrameAddr =
7202       DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
7203   while (Depth--)
7204     FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
7205                             MachinePointerInfo());
7206 
7207   if (Subtarget->isTargetILP32())
7208     FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
7209                             DAG.getValueType(VT));
7210 
7211   return FrameAddr;
7212 }
7213 
7214 SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
7215                                               SelectionDAG &DAG) const {
7216   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7217 
7218   EVT VT = getPointerTy(DAG.getDataLayout());
7219   SDLoc DL(Op);
7220   int FI = MFI.CreateFixedObject(4, 0, false);
7221   return DAG.getFrameIndex(FI, VT);
7222 }
7223 
7224 #define GET_REGISTER_MATCHER
7225 #include "AArch64GenAsmMatcher.inc"
7226 
7227 // FIXME? Maybe this could be a TableGen attribute on some registers and
7228 // this table could be generated automatically from RegInfo.
7229 Register AArch64TargetLowering::
7230 getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
7231   Register Reg = MatchRegisterName(RegName);
7232   if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
7233     const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
7234     unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
7235     if (!Subtarget->isXRegisterReserved(DwarfRegNum))
7236       Reg = 0;
7237   }
7238   if (Reg)
7239     return Reg;
7240   report_fatal_error(Twine("Invalid register name \""
7241                               + StringRef(RegName)  + "\"."));
7242 }
7243 
7244 SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
7245                                                      SelectionDAG &DAG) const {
7246   DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
7247 
7248   EVT VT = Op.getValueType();
7249   SDLoc DL(Op);
7250 
7251   SDValue FrameAddr =
7252       DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
7253   SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
7254 
7255   return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
7256 }
7257 
7258 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
7259                                                SelectionDAG &DAG) const {
7260   MachineFunction &MF = DAG.getMachineFunction();
7261   MachineFrameInfo &MFI = MF.getFrameInfo();
7262   MFI.setReturnAddressIsTaken(true);
7263 
7264   EVT VT = Op.getValueType();
7265   SDLoc DL(Op);
7266   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7267   SDValue ReturnAddress;
7268   if (Depth) {
7269     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
7270     SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
7271     ReturnAddress = DAG.getLoad(
7272         VT, DL, DAG.getEntryNode(),
7273         DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
7274   } else {
7275     // Return LR, which contains the return address. Mark it an implicit
7276     // live-in.
7277     unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
7278     ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7279   }
7280 
7281   // The XPACLRI instruction assembles to a hint-space instruction before
7282   // Armv8.3-A therefore this instruction can be safely used for any pre
7283   // Armv8.3-A architectures. On Armv8.3-A and onwards XPACI is available so use
7284   // that instead.
7285   SDNode *St;
7286   if (Subtarget->hasPAuth()) {
7287     St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
7288   } else {
7289     // XPACLRI operates on LR therefore we must move the operand accordingly.
7290     SDValue Chain =
7291         DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
7292     St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
7293   }
7294   return SDValue(St, 0);
7295 }
7296 
7297 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two
7298 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
7299 SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
7300                                                     SelectionDAG &DAG) const {
7301   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
7302   EVT VT = Op.getValueType();
7303   unsigned VTBits = VT.getSizeInBits();
7304   SDLoc dl(Op);
7305   SDValue ShOpLo = Op.getOperand(0);
7306   SDValue ShOpHi = Op.getOperand(1);
7307   SDValue ShAmt = Op.getOperand(2);
7308   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
7309 
7310   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
7311 
7312   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
7313                                  DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
7314   SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
7315 
7316   // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
7317   // is "undef". We wanted 0, so CSEL it directly.
7318   SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
7319                                ISD::SETEQ, dl, DAG);
7320   SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
7321   HiBitsForLo =
7322       DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
7323                   HiBitsForLo, CCVal, Cmp);
7324 
7325   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
7326                                    DAG.getConstant(VTBits, dl, MVT::i64));
7327 
7328   SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
7329   SDValue LoForNormalShift =
7330       DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
7331 
7332   Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
7333                        dl, DAG);
7334   CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
7335   SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
7336   SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
7337                            LoForNormalShift, CCVal, Cmp);
7338 
7339   // AArch64 shifts larger than the register width are wrapped rather than
7340   // clamped, so we can't just emit "hi >> x".
7341   SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
7342   SDValue HiForBigShift =
7343       Opc == ISD::SRA
7344           ? DAG.getNode(Opc, dl, VT, ShOpHi,
7345                         DAG.getConstant(VTBits - 1, dl, MVT::i64))
7346           : DAG.getConstant(0, dl, VT);
7347   SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
7348                            HiForNormalShift, CCVal, Cmp);
7349 
7350   SDValue Ops[2] = { Lo, Hi };
7351   return DAG.getMergeValues(Ops, dl);
7352 }
7353 
7354 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
7355 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
7356 SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
7357                                                    SelectionDAG &DAG) const {
7358   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
7359   EVT VT = Op.getValueType();
7360   unsigned VTBits = VT.getSizeInBits();
7361   SDLoc dl(Op);
7362   SDValue ShOpLo = Op.getOperand(0);
7363   SDValue ShOpHi = Op.getOperand(1);
7364   SDValue ShAmt = Op.getOperand(2);
7365 
7366   assert(Op.getOpcode() == ISD::SHL_PARTS);
7367   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
7368                                  DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
7369   SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
7370 
7371   // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
7372   // is "undef". We wanted 0, so CSEL it directly.
7373   SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
7374                                ISD::SETEQ, dl, DAG);
7375   SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
7376   LoBitsForHi =
7377       DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
7378                   LoBitsForHi, CCVal, Cmp);
7379 
7380   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
7381                                    DAG.getConstant(VTBits, dl, MVT::i64));
7382   SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
7383   SDValue HiForNormalShift =
7384       DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
7385 
7386   SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
7387 
7388   Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
7389                        dl, DAG);
7390   CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
7391   SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
7392                            HiForNormalShift, CCVal, Cmp);
7393 
7394   // AArch64 shifts of larger than register sizes are wrapped rather than
7395   // clamped, so we can't just emit "lo << a" if a is too big.
7396   SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
7397   SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
7398   SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
7399                            LoForNormalShift, CCVal, Cmp);
7400 
7401   SDValue Ops[2] = { Lo, Hi };
7402   return DAG.getMergeValues(Ops, dl);
7403 }
7404 
/// Report whether an offset may be folded into the given global address node.
/// This target always answers "no", regardless of \p GA.
bool AArch64TargetLowering::isOffsetFoldingLegal(
    const GlobalAddressSDNode *GA) const {
  // Offsets are folded in the DAG combine rather than here so that we can
  // intelligently choose an offset based on the uses.
  return false;
}
7411 
7412 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
7413                                          bool OptForSize) const {
7414   bool IsLegal = false;
7415   // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
7416   // 16-bit case when target has full fp16 support.
7417   // FIXME: We should be able to handle f128 as well with a clever lowering.
7418   const APInt ImmInt = Imm.bitcastToAPInt();
7419   if (VT == MVT::f64)
7420     IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
7421   else if (VT == MVT::f32)
7422     IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
7423   else if (VT == MVT::f16 && Subtarget->hasFullFP16())
7424     IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
7425   // TODO: fmov h0, w0 is also legal, however on't have an isel pattern to
7426   //       generate that fmov.
7427 
7428   // If we can not materialize in immediate field for fmov, check if the
7429   // value can be encoded as the immediate operand of a logical instruction.
7430   // The immediate value will be created with either MOVZ, MOVN, or ORR.
7431   if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
7432     // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
7433     // however the mov+fmov sequence is always better because of the reduced
7434     // cache pressure. The timings are still the same if you consider
7435     // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
7436     // movw+movk is fused). So we limit up to 2 instrdduction at most.
7437     SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7438     AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
7439 			      Insn);
7440     unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
7441     IsLegal = Insn.size() <= Limit;
7442   }
7443 
7444   LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
7445                     << " imm value: "; Imm.dump(););
7446   return IsLegal;
7447 }
7448 
7449 //===----------------------------------------------------------------------===//
7450 //                          AArch64 Optimization Hooks
7451 //===----------------------------------------------------------------------===//
7452 
7453 static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
7454                            SDValue Operand, SelectionDAG &DAG,
7455                            int &ExtraSteps) {
7456   EVT VT = Operand.getValueType();
7457   if (ST->hasNEON() &&
7458       (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
7459        VT == MVT::f32 || VT == MVT::v1f32 ||
7460        VT == MVT::v2f32 || VT == MVT::v4f32)) {
7461     if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
7462       // For the reciprocal estimates, convergence is quadratic, so the number
7463       // of digits is doubled after each iteration.  In ARMv8, the accuracy of
7464       // the initial estimate is 2^-8.  Thus the number of extra steps to refine
7465       // the result for float (23 mantissa bits) is 2 and for double (52
7466       // mantissa bits) is 3.
7467       ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
7468 
7469     return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
7470   }
7471 
7472   return SDValue();
7473 }
7474 
7475 SDValue
7476 AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
7477                                         const DenormalMode &Mode) const {
7478   SDLoc DL(Op);
7479   EVT VT = Op.getValueType();
7480   EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
7481   SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
7482   return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
7483 }
7484 
/// Return the square-root result to use when the input test (see
/// getSqrtInputTest) matched: the operand \p Op itself, unchanged.
SDValue
AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
                                                   SelectionDAG &DAG) const {
  return Op;
}
7490 
7491 SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
7492                                                SelectionDAG &DAG, int Enabled,
7493                                                int &ExtraSteps,
7494                                                bool &UseOneConst,
7495                                                bool Reciprocal) const {
7496   if (Enabled == ReciprocalEstimate::Enabled ||
7497       (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
7498     if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
7499                                        DAG, ExtraSteps)) {
7500       SDLoc DL(Operand);
7501       EVT VT = Operand.getValueType();
7502 
7503       SDNodeFlags Flags;
7504       Flags.setAllowReassociation(true);
7505 
7506       // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
7507       // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
7508       for (int i = ExtraSteps; i > 0; --i) {
7509         SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
7510                                    Flags);
7511         Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
7512         Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
7513       }
7514       if (!Reciprocal)
7515         Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
7516 
7517       ExtraSteps = 0;
7518       return Estimate;
7519     }
7520 
7521   return SDValue();
7522 }
7523 
7524 SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
7525                                                 SelectionDAG &DAG, int Enabled,
7526                                                 int &ExtraSteps) const {
7527   if (Enabled == ReciprocalEstimate::Enabled)
7528     if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
7529                                        DAG, ExtraSteps)) {
7530       SDLoc DL(Operand);
7531       EVT VT = Operand.getValueType();
7532 
7533       SDNodeFlags Flags;
7534       Flags.setAllowReassociation(true);
7535 
7536       // Newton reciprocal iteration: E * (2 - X * E)
7537       // AArch64 reciprocal iteration instruction: (2 - M * N)
7538       for (int i = ExtraSteps; i > 0; --i) {
7539         SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
7540                                    Estimate, Flags);
7541         Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
7542       }
7543 
7544       ExtraSteps = 0;
7545       return Estimate;
7546     }
7547 
7548   return SDValue();
7549 }
7550 
7551 //===----------------------------------------------------------------------===//
7552 //                          AArch64 Inline Assembly Support
7553 //===----------------------------------------------------------------------===//
7554 
7555 // Table of Constraints
7556 // TODO: This is the current set of constraints supported by ARM for the
7557 // compiler, not all of them may make sense.
7558 //
7559 // r - A general register
7560 // w - An FP/SIMD register of some size in the range v0-v31
7561 // x - An FP/SIMD register of some size in the range v0-v15
7562 // I - Constant that can be used with an ADD instruction
7563 // J - Constant that can be used with a SUB instruction
7564 // K - Constant that can be used with a 32-bit logical instruction
7565 // L - Constant that can be used with a 64-bit logical instruction
7566 // M - Constant that can be used as a 32-bit MOV immediate
7567 // N - Constant that can be used as a 64-bit MOV immediate
7568 // Q - A memory reference with base register and no offset
7569 // S - A symbolic address
7570 // Y - Floating point constant zero
7571 // Z - Integer constant zero
7572 //
7573 //   Note that general register operands will be output using their 64-bit x
7574 // register name, whatever the size of the variable, unless the asm operand
7575 // is prefixed by the %w modifier. Floating-point and SIMD register operands
7576 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
7577 // %q modifier.
7578 const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
7579   // At this point, we have to lower this constraint to something else, so we
7580   // lower it to an "r" or "w". However, by doing this we will force the result
7581   // to be in register, while the X constraint is much more permissive.
7582   //
7583   // Although we are correct (we are free to emit anything, without
7584   // constraints), we might break use cases that would expect us to be more
7585   // efficient and emit something else.
7586   if (!Subtarget->hasFPARMv8())
7587     return "r";
7588 
7589   if (ConstraintVT.isFloatingPoint())
7590     return "w";
7591 
7592   if (ConstraintVT.isVector() &&
7593      (ConstraintVT.getSizeInBits() == 64 ||
7594       ConstraintVT.getSizeInBits() == 128))
7595     return "w";
7596 
7597   return "r";
7598 }
7599 
/// Result of parsing a multi-letter predicate constraint string
/// (see parsePredicateConstraint).
enum PredicateConstraint {
  Upl,    // The "Upl" constraint.
  Upa,    // The "Upa" constraint.
  Invalid // Any other string.
};
7605 
7606 static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
7607   PredicateConstraint P = PredicateConstraint::Invalid;
7608   if (Constraint == "Upa")
7609     P = PredicateConstraint::Upa;
7610   if (Constraint == "Upl")
7611     P = PredicateConstraint::Upl;
7612   return P;
7613 }
7614 
7615 /// getConstraintType - Given a constraint letter, return the type of
7616 /// constraint it is for this target.
7617 AArch64TargetLowering::ConstraintType
7618 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
7619   if (Constraint.size() == 1) {
7620     switch (Constraint[0]) {
7621     default:
7622       break;
7623     case 'x':
7624     case 'w':
7625     case 'y':
7626       return C_RegisterClass;
7627     // An address with a single base register. Due to the way we
7628     // currently handle addresses it is the same as 'r'.
7629     case 'Q':
7630       return C_Memory;
7631     case 'I':
7632     case 'J':
7633     case 'K':
7634     case 'L':
7635     case 'M':
7636     case 'N':
7637     case 'Y':
7638     case 'Z':
7639       return C_Immediate;
7640     case 'z':
7641     case 'S': // A symbolic address
7642       return C_Other;
7643     }
7644   } else if (parsePredicateConstraint(Constraint) !=
7645              PredicateConstraint::Invalid)
7646       return C_RegisterClass;
7647   return TargetLowering::getConstraintType(Constraint);
7648 }
7649 
7650 /// Examine constraint type and operand type and determine a weight value.
7651 /// This object must already have been set up with the operand type
7652 /// and the current alternative constraint selected.
7653 TargetLowering::ConstraintWeight
7654 AArch64TargetLowering::getSingleConstraintMatchWeight(
7655     AsmOperandInfo &info, const char *constraint) const {
7656   ConstraintWeight weight = CW_Invalid;
7657   Value *CallOperandVal = info.CallOperandVal;
7658   // If we don't have a value, we can't do a match,
7659   // but allow it at the lowest weight.
7660   if (!CallOperandVal)
7661     return CW_Default;
7662   Type *type = CallOperandVal->getType();
7663   // Look at the constraint type.
7664   switch (*constraint) {
7665   default:
7666     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
7667     break;
7668   case 'x':
7669   case 'w':
7670   case 'y':
7671     if (type->isFloatingPointTy() || type->isVectorTy())
7672       weight = CW_Register;
7673     break;
7674   case 'z':
7675     weight = CW_Constant;
7676     break;
7677   case 'U':
7678     if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid)
7679       weight = CW_Register;
7680     break;
7681   }
7682   return weight;
7683 }
7684 
7685 std::pair<unsigned, const TargetRegisterClass *>
7686 AArch64TargetLowering::getRegForInlineAsmConstraint(
7687     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
7688   if (Constraint.size() == 1) {
7689     switch (Constraint[0]) {
7690     case 'r':
7691       if (VT.isScalableVector())
7692         return std::make_pair(0U, nullptr);
7693       if (VT.getFixedSizeInBits() == 64)
7694         return std::make_pair(0U, &AArch64::GPR64commonRegClass);
7695       return std::make_pair(0U, &AArch64::GPR32commonRegClass);
7696     case 'w': {
7697       if (!Subtarget->hasFPARMv8())
7698         break;
7699       if (VT.isScalableVector()) {
7700         if (VT.getVectorElementType() != MVT::i1)
7701           return std::make_pair(0U, &AArch64::ZPRRegClass);
7702         return std::make_pair(0U, nullptr);
7703       }
7704       uint64_t VTSize = VT.getFixedSizeInBits();
7705       if (VTSize == 16)
7706         return std::make_pair(0U, &AArch64::FPR16RegClass);
7707       if (VTSize == 32)
7708         return std::make_pair(0U, &AArch64::FPR32RegClass);
7709       if (VTSize == 64)
7710         return std::make_pair(0U, &AArch64::FPR64RegClass);
7711       if (VTSize == 128)
7712         return std::make_pair(0U, &AArch64::FPR128RegClass);
7713       break;
7714     }
7715     // The instructions that this constraint is designed for can
7716     // only take 128-bit registers so just use that regclass.
7717     case 'x':
7718       if (!Subtarget->hasFPARMv8())
7719         break;
7720       if (VT.isScalableVector())
7721         return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
7722       if (VT.getSizeInBits() == 128)
7723         return std::make_pair(0U, &AArch64::FPR128_loRegClass);
7724       break;
7725     case 'y':
7726       if (!Subtarget->hasFPARMv8())
7727         break;
7728       if (VT.isScalableVector())
7729         return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
7730       break;
7731     }
7732   } else {
7733     PredicateConstraint PC = parsePredicateConstraint(Constraint);
7734     if (PC != PredicateConstraint::Invalid) {
7735       if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
7736         return std::make_pair(0U, nullptr);
7737       bool restricted = (PC == PredicateConstraint::Upl);
7738       return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
7739                         : std::make_pair(0U, &AArch64::PPRRegClass);
7740     }
7741   }
7742   if (StringRef("{cc}").equals_lower(Constraint))
7743     return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
7744 
7745   // Use the default implementation in TargetLowering to convert the register
7746   // constraint into a member of a register class.
7747   std::pair<unsigned, const TargetRegisterClass *> Res;
7748   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
7749 
7750   // Not found as a standard register?
7751   if (!Res.second) {
7752     unsigned Size = Constraint.size();
7753     if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
7754         tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
7755       int RegNo;
7756       bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
7757       if (!Failed && RegNo >= 0 && RegNo <= 31) {
7758         // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
7759         // By default we'll emit v0-v31 for this unless there's a modifier where
7760         // we'll emit the correct register as well.
7761         if (VT != MVT::Other && VT.getSizeInBits() == 64) {
7762           Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
7763           Res.second = &AArch64::FPR64RegClass;
7764         } else {
7765           Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
7766           Res.second = &AArch64::FPR128RegClass;
7767         }
7768       }
7769     }
7770   }
7771 
7772   if (Res.second && !Subtarget->hasFPARMv8() &&
7773       !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
7774       !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
7775     return std::make_pair(0U, nullptr);
7776 
7777   return Res;
7778 }
7779 
7780 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
7781 /// vector.  If it is invalid, don't add anything to Ops.
7782 void AArch64TargetLowering::LowerAsmOperandForConstraint(
7783     SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
7784     SelectionDAG &DAG) const {
7785   SDValue Result;
7786 
7787   // Currently only support length 1 constraints.
7788   if (Constraint.length() != 1)
7789     return;
7790 
7791   char ConstraintLetter = Constraint[0];
7792   switch (ConstraintLetter) {
7793   default:
7794     break;
7795 
7796   // This set of constraints deal with valid constants for various instructions.
7797   // Validate and return a target constant for them if we can.
7798   case 'z': {
7799     // 'z' maps to xzr or wzr so it needs an input of 0.
7800     if (!isNullConstant(Op))
7801       return;
7802 
7803     if (Op.getValueType() == MVT::i64)
7804       Result = DAG.getRegister(AArch64::XZR, MVT::i64);
7805     else
7806       Result = DAG.getRegister(AArch64::WZR, MVT::i32);
7807     break;
7808   }
7809   case 'S': {
7810     // An absolute symbolic address or label reference.
7811     if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
7812       Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
7813                                           GA->getValueType(0));
7814     } else if (const BlockAddressSDNode *BA =
7815                    dyn_cast<BlockAddressSDNode>(Op)) {
7816       Result =
7817           DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
7818     } else if (const ExternalSymbolSDNode *ES =
7819                    dyn_cast<ExternalSymbolSDNode>(Op)) {
7820       Result =
7821           DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0));
7822     } else
7823       return;
7824     break;
7825   }
7826 
7827   case 'I':
7828   case 'J':
7829   case 'K':
7830   case 'L':
7831   case 'M':
7832   case 'N':
7833     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
7834     if (!C)
7835       return;
7836 
7837     // Grab the value and do some validation.
7838     uint64_t CVal = C->getZExtValue();
7839     switch (ConstraintLetter) {
7840     // The I constraint applies only to simple ADD or SUB immediate operands:
7841     // i.e. 0 to 4095 with optional shift by 12
7842     // The J constraint applies only to ADD or SUB immediates that would be
7843     // valid when negated, i.e. if [an add pattern] were to be output as a SUB
7844     // instruction [or vice versa], in other words -1 to -4095 with optional
7845     // left shift by 12.
7846     case 'I':
7847       if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
7848         break;
7849       return;
7850     case 'J': {
7851       uint64_t NVal = -C->getSExtValue();
7852       if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
7853         CVal = C->getSExtValue();
7854         break;
7855       }
7856       return;
7857     }
7858     // The K and L constraints apply *only* to logical immediates, including
7859     // what used to be the MOVI alias for ORR (though the MOVI alias has now
7860     // been removed and MOV should be used). So these constraints have to
7861     // distinguish between bit patterns that are valid 32-bit or 64-bit
7862     // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
7863     // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
7864     // versa.
7865     case 'K':
7866       if (AArch64_AM::isLogicalImmediate(CVal, 32))
7867         break;
7868       return;
7869     case 'L':
7870       if (AArch64_AM::isLogicalImmediate(CVal, 64))
7871         break;
7872       return;
7873     // The M and N constraints are a superset of K and L respectively, for use
7874     // with the MOV (immediate) alias. As well as the logical immediates they
7875     // also match 32 or 64-bit immediates that can be loaded either using a
7876     // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
7877     // (M) or 64-bit 0x1234000000000000 (N) etc.
7878     // As a note some of this code is liberally stolen from the asm parser.
7879     case 'M': {
7880       if (!isUInt<32>(CVal))
7881         return;
7882       if (AArch64_AM::isLogicalImmediate(CVal, 32))
7883         break;
7884       if ((CVal & 0xFFFF) == CVal)
7885         break;
7886       if ((CVal & 0xFFFF0000ULL) == CVal)
7887         break;
7888       uint64_t NCVal = ~(uint32_t)CVal;
7889       if ((NCVal & 0xFFFFULL) == NCVal)
7890         break;
7891       if ((NCVal & 0xFFFF0000ULL) == NCVal)
7892         break;
7893       return;
7894     }
7895     case 'N': {
7896       if (AArch64_AM::isLogicalImmediate(CVal, 64))
7897         break;
7898       if ((CVal & 0xFFFFULL) == CVal)
7899         break;
7900       if ((CVal & 0xFFFF0000ULL) == CVal)
7901         break;
7902       if ((CVal & 0xFFFF00000000ULL) == CVal)
7903         break;
7904       if ((CVal & 0xFFFF000000000000ULL) == CVal)
7905         break;
7906       uint64_t NCVal = ~CVal;
7907       if ((NCVal & 0xFFFFULL) == NCVal)
7908         break;
7909       if ((NCVal & 0xFFFF0000ULL) == NCVal)
7910         break;
7911       if ((NCVal & 0xFFFF00000000ULL) == NCVal)
7912         break;
7913       if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
7914         break;
7915       return;
7916     }
7917     default:
7918       return;
7919     }
7920 
7921     // All assembler immediates are 64-bit integers.
7922     Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
7923     break;
7924   }
7925 
7926   if (Result.getNode()) {
7927     Ops.push_back(Result);
7928     return;
7929   }
7930 
7931   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
7932 }
7933 
7934 //===----------------------------------------------------------------------===//
7935 //                     AArch64 Advanced SIMD Support
7936 //===----------------------------------------------------------------------===//
7937 
7938 /// WidenVector - Given a value in the V64 register class, produce the
7939 /// equivalent value in the V128 register class.
7940 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
7941   EVT VT = V64Reg.getValueType();
7942   unsigned NarrowSize = VT.getVectorNumElements();
7943   MVT EltTy = VT.getVectorElementType().getSimpleVT();
7944   MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
7945   SDLoc DL(V64Reg);
7946 
7947   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
7948                      V64Reg, DAG.getConstant(0, DL, MVT::i32));
7949 }
7950 
7951 /// getExtFactor - Determine the adjustment factor for the position when
7952 /// generating an "extract from vector registers" instruction.
7953 static unsigned getExtFactor(SDValue &V) {
7954   EVT EltType = V.getValueType().getVectorElementType();
7955   return EltType.getSizeInBits() / 8;
7956 }
7957 
7958 /// NarrowVector - Given a value in the V128 register class, produce the
7959 /// equivalent value in the V64 register class.
7960 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
7961   EVT VT = V128Reg.getValueType();
7962   unsigned WideSize = VT.getVectorNumElements();
7963   MVT EltTy = VT.getVectorElementType().getSimpleVT();
7964   MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
7965   SDLoc DL(V128Reg);
7966 
7967   return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
7968 }
7969 
7970 // Gather data to see if the operation can be modelled as a
7971 // shuffle in combination with VEXTs.
7972 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
7973                                                   SelectionDAG &DAG) const {
7974   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7975   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
7976   SDLoc dl(Op);
7977   EVT VT = Op.getValueType();
7978   assert(!VT.isScalableVector() &&
7979          "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
7980   unsigned NumElts = VT.getVectorNumElements();
7981 
7982   struct ShuffleSourceInfo {
7983     SDValue Vec;
7984     unsigned MinElt;
7985     unsigned MaxElt;
7986 
7987     // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
7988     // be compatible with the shuffle we intend to construct. As a result
7989     // ShuffleVec will be some sliding window into the original Vec.
7990     SDValue ShuffleVec;
7991 
7992     // Code should guarantee that element i in Vec starts at element "WindowBase
7993     // + i * WindowScale in ShuffleVec".
7994     int WindowBase;
7995     int WindowScale;
7996 
7997     ShuffleSourceInfo(SDValue Vec)
7998       : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
7999           ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
8000 
8001     bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
8002   };
8003 
8004   // First gather all vectors used as an immediate source for this BUILD_VECTOR
8005   // node.
8006   SmallVector<ShuffleSourceInfo, 2> Sources;
8007   for (unsigned i = 0; i < NumElts; ++i) {
8008     SDValue V = Op.getOperand(i);
8009     if (V.isUndef())
8010       continue;
8011     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8012              !isa<ConstantSDNode>(V.getOperand(1))) {
8013       LLVM_DEBUG(
8014           dbgs() << "Reshuffle failed: "
8015                     "a shuffle can only come from building a vector from "
8016                     "various elements of other vectors, provided their "
8017                     "indices are constant\n");
8018       return SDValue();
8019     }
8020 
8021     // Add this element source to the list if it's not already there.
8022     SDValue SourceVec = V.getOperand(0);
8023     auto Source = find(Sources, SourceVec);
8024     if (Source == Sources.end())
8025       Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
8026 
8027     // Update the minimum and maximum lane number seen.
8028     unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
8029     Source->MinElt = std::min(Source->MinElt, EltNo);
8030     Source->MaxElt = std::max(Source->MaxElt, EltNo);
8031   }
8032 
8033   if (Sources.size() > 2) {
8034     LLVM_DEBUG(
8035         dbgs() << "Reshuffle failed: currently only do something sane when at "
8036                   "most two source vectors are involved\n");
8037     return SDValue();
8038   }
8039 
8040   // Find out the smallest element size among result and two sources, and use
8041   // it as element size to build the shuffle_vector.
8042   EVT SmallestEltTy = VT.getVectorElementType();
8043   for (auto &Source : Sources) {
8044     EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8045     if (SrcEltTy.bitsLT(SmallestEltTy)) {
8046       SmallestEltTy = SrcEltTy;
8047     }
8048   }
8049   unsigned ResMultiplier =
8050       VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
8051   uint64_t VTSize = VT.getFixedSizeInBits();
8052   NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
8053   EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
8054 
8055   // If the source vector is too wide or too narrow, we may nevertheless be able
8056   // to construct a compatible shuffle either by concatenating it with UNDEF or
8057   // extracting a suitable range of elements.
8058   for (auto &Src : Sources) {
8059     EVT SrcVT = Src.ShuffleVec.getValueType();
8060 
8061     uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8062     if (SrcVTSize == VTSize)
8063       continue;
8064 
8065     // This stage of the search produces a source with the same element type as
8066     // the original, but with a total width matching the BUILD_VECTOR output.
8067     EVT EltVT = SrcVT.getVectorElementType();
8068     unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8069     EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
8070 
8071     if (SrcVTSize < VTSize) {
8072       assert(2 * SrcVTSize == VTSize);
8073       // We can pad out the smaller vector for free, so if it's part of a
8074       // shuffle...
8075       Src.ShuffleVec =
8076           DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8077                       DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8078       continue;
8079     }
8080 
8081     if (SrcVTSize != 2 * VTSize) {
8082       LLVM_DEBUG(
8083           dbgs() << "Reshuffle failed: result vector too small to extract\n");
8084       return SDValue();
8085     }
8086 
8087     if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8088       LLVM_DEBUG(
8089           dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
8090       return SDValue();
8091     }
8092 
8093     if (Src.MinElt >= NumSrcElts) {
8094       // The extraction can just take the second half
8095       Src.ShuffleVec =
8096           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8097                       DAG.getConstant(NumSrcElts, dl, MVT::i64));
8098       Src.WindowBase = -NumSrcElts;
8099     } else if (Src.MaxElt < NumSrcElts) {
8100       // The extraction can just take the first half
8101       Src.ShuffleVec =
8102           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8103                       DAG.getConstant(0, dl, MVT::i64));
8104     } else {
8105       // An actual VEXT is needed
8106       SDValue VEXTSrc1 =
8107           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8108                       DAG.getConstant(0, dl, MVT::i64));
8109       SDValue VEXTSrc2 =
8110           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8111                       DAG.getConstant(NumSrcElts, dl, MVT::i64));
8112       unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
8113 
8114       if (!SrcVT.is64BitVector()) {
8115         LLVM_DEBUG(
8116           dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
8117                     "for SVE vectors.");
8118         return SDValue();
8119       }
8120 
8121       Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
8122                                    VEXTSrc2,
8123                                    DAG.getConstant(Imm, dl, MVT::i32));
8124       Src.WindowBase = -Src.MinElt;
8125     }
8126   }
8127 
8128   // Another possible incompatibility occurs from the vector element types. We
8129   // can fix this by bitcasting the source vectors to the same type we intend
8130   // for the shuffle.
8131   for (auto &Src : Sources) {
8132     EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8133     if (SrcEltTy == SmallestEltTy)
8134       continue;
8135     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8136     Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
8137     Src.WindowScale =
8138         SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
8139     Src.WindowBase *= Src.WindowScale;
8140   }
8141 
8142   // Final sanity check before we try to actually produce a shuffle.
8143   LLVM_DEBUG(for (auto Src
8144                   : Sources)
8145                  assert(Src.ShuffleVec.getValueType() == ShuffleVT););
8146 
8147   // The stars all align, our next step is to produce the mask for the shuffle.
8148   SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8149   int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8150   for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8151     SDValue Entry = Op.getOperand(i);
8152     if (Entry.isUndef())
8153       continue;
8154 
8155     auto Src = find(Sources, Entry.getOperand(0));
8156     int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8157 
8158     // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8159     // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8160     // segment.
8161     EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8162     int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8163                                VT.getScalarSizeInBits());
8164     int LanesDefined = BitsDefined / BitsPerShuffleLane;
8165 
8166     // This source is expected to fill ResMultiplier lanes of the final shuffle,
8167     // starting at the appropriate offset.
8168     int *LaneMask = &Mask[i * ResMultiplier];
8169 
8170     int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8171     ExtractBase += NumElts * (Src - Sources.begin());
8172     for (int j = 0; j < LanesDefined; ++j)
8173       LaneMask[j] = ExtractBase + j;
8174   }
8175 
8176   // Final check before we try to produce nonsense...
8177   if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
8178     LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
8179     return SDValue();
8180   }
8181 
8182   SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8183   for (unsigned i = 0; i < Sources.size(); ++i)
8184     ShuffleOps[i] = Sources[i].ShuffleVec;
8185 
8186   SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8187                                          ShuffleOps[1], Mask);
8188   SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
8189 
8190   LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
8191              dbgs() << "Reshuffle, creating node: "; V.dump(););
8192 
8193   return V;
8194 }
8195 
8196 // check if an EXT instruction can handle the shuffle mask when the
8197 // vector sources of the shuffle are the same.
8198 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
8199   unsigned NumElts = VT.getVectorNumElements();
8200 
8201   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
8202   if (M[0] < 0)
8203     return false;
8204 
8205   Imm = M[0];
8206 
8207   // If this is a VEXT shuffle, the immediate value is the index of the first
8208   // element.  The other shuffle indices must be the successive elements after
8209   // the first one.
8210   unsigned ExpectedElt = Imm;
8211   for (unsigned i = 1; i < NumElts; ++i) {
8212     // Increment the expected index.  If it wraps around, just follow it
8213     // back to index zero and keep going.
8214     ++ExpectedElt;
8215     if (ExpectedElt == NumElts)
8216       ExpectedElt = 0;
8217 
8218     if (M[i] < 0)
8219       continue; // ignore UNDEF indices
8220     if (ExpectedElt != static_cast<unsigned>(M[i]))
8221       return false;
8222   }
8223 
8224   return true;
8225 }
8226 
8227 /// Check if a vector shuffle corresponds to a DUP instructions with a larger
8228 /// element width than the vector lane type. If that is the case the function
8229 /// returns true and writes the value of the DUP instruction lane operand into
8230 /// DupLaneOp
8231 static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
8232                           unsigned &DupLaneOp) {
8233   assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
8234          "Only possible block sizes for wide DUP are: 16, 32, 64");
8235 
8236   if (BlockSize <= VT.getScalarSizeInBits())
8237     return false;
8238   if (BlockSize % VT.getScalarSizeInBits() != 0)
8239     return false;
8240   if (VT.getSizeInBits() % BlockSize != 0)
8241     return false;
8242 
8243   size_t SingleVecNumElements = VT.getVectorNumElements();
8244   size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
8245   size_t NumBlocks = VT.getSizeInBits() / BlockSize;
8246 
8247   // We are looking for masks like
8248   // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
8249   // might be replaced by 'undefined'. BlockIndices will eventually contain
8250   // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
8251   // for the above examples)
8252   SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
8253   for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
8254     for (size_t I = 0; I < NumEltsPerBlock; I++) {
8255       int Elt = M[BlockIndex * NumEltsPerBlock + I];
8256       if (Elt < 0)
8257         continue;
8258       // For now we don't support shuffles that use the second operand
8259       if ((unsigned)Elt >= SingleVecNumElements)
8260         return false;
8261       if (BlockElts[I] < 0)
8262         BlockElts[I] = Elt;
8263       else if (BlockElts[I] != Elt)
8264         return false;
8265     }
8266 
8267   // We found a candidate block (possibly with some undefs). It must be a
8268   // sequence of consecutive integers starting with a value divisible by
8269   // NumEltsPerBlock with some values possibly replaced by undef-s.
8270 
8271   // Find first non-undef element
8272   auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
8273   assert(FirstRealEltIter != BlockElts.end() &&
8274          "Shuffle with all-undefs must have been caught by previous cases, "
8275          "e.g. isSplat()");
8276   if (FirstRealEltIter == BlockElts.end()) {
8277     DupLaneOp = 0;
8278     return true;
8279   }
8280 
8281   // Index of FirstRealElt in BlockElts
8282   size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
8283 
8284   if ((unsigned)*FirstRealEltIter < FirstRealIndex)
8285     return false;
8286   // BlockElts[0] must have the following value if it isn't undef:
8287   size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
8288 
8289   // Check the first element
8290   if (Elt0 % NumEltsPerBlock != 0)
8291     return false;
8292   // Check that the sequence indeed consists of consecutive integers (modulo
8293   // undefs)
8294   for (size_t I = 0; I < NumEltsPerBlock; I++)
8295     if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
8296       return false;
8297 
8298   DupLaneOp = Elt0 / NumEltsPerBlock;
8299   return true;
8300 }
8301 
8302 // check if an EXT instruction can handle the shuffle mask when the
8303 // vector sources of the shuffle are different.
8304 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
8305                       unsigned &Imm) {
8306   // Look for the first non-undef element.
8307   const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
8308 
8309   // Benefit form APInt to handle overflow when calculating expected element.
8310   unsigned NumElts = VT.getVectorNumElements();
8311   unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
8312   APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
8313   // The following shuffle indices must be the successive elements after the
8314   // first real element.
8315   const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
8316       [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
8317   if (FirstWrongElt != M.end())
8318     return false;
8319 
8320   // The index of an EXT is the first element if it is not UNDEF.
8321   // Watch out for the beginning UNDEFs. The EXT index should be the expected
8322   // value of the first element.  E.g.
8323   // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
8324   // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
8325   // ExpectedElt is the last mask index plus 1.
8326   Imm = ExpectedElt.getZExtValue();
8327 
8328   // There are two difference cases requiring to reverse input vectors.
8329   // For example, for vector <4 x i32> we have the following cases,
8330   // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
8331   // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
8332   // For both cases, we finally use mask <5, 6, 7, 0>, which requires
8333   // to reverse two input vectors.
8334   if (Imm < NumElts)
8335     ReverseEXT = true;
8336   else
8337     Imm -= NumElts;
8338 
8339   return true;
8340 }
8341 
8342 /// isREVMask - Check if a vector shuffle corresponds to a REV
8343 /// instruction with the specified blocksize.  (The order of the elements
8344 /// within each block of the vector is reversed.)
8345 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
8346   assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
8347          "Only possible block sizes for REV are: 16, 32, 64");
8348 
8349   unsigned EltSz = VT.getScalarSizeInBits();
8350   if (EltSz == 64)
8351     return false;
8352 
8353   unsigned NumElts = VT.getVectorNumElements();
8354   unsigned BlockElts = M[0] + 1;
8355   // If the first shuffle index is UNDEF, be optimistic.
8356   if (M[0] < 0)
8357     BlockElts = BlockSize / EltSz;
8358 
8359   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
8360     return false;
8361 
8362   for (unsigned i = 0; i < NumElts; ++i) {
8363     if (M[i] < 0)
8364       continue; // ignore UNDEF indices
8365     if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
8366       return false;
8367   }
8368 
8369   return true;
8370 }
8371 
8372 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8373   unsigned NumElts = VT.getVectorNumElements();
8374   if (NumElts % 2 != 0)
8375     return false;
8376   WhichResult = (M[0] == 0 ? 0 : 1);
8377   unsigned Idx = WhichResult * NumElts / 2;
8378   for (unsigned i = 0; i != NumElts; i += 2) {
8379     if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
8380         (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
8381       return false;
8382     Idx += 1;
8383   }
8384 
8385   return true;
8386 }
8387 
8388 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8389   unsigned NumElts = VT.getVectorNumElements();
8390   WhichResult = (M[0] == 0 ? 0 : 1);
8391   for (unsigned i = 0; i != NumElts; ++i) {
8392     if (M[i] < 0)
8393       continue; // ignore UNDEF indices
8394     if ((unsigned)M[i] != 2 * i + WhichResult)
8395       return false;
8396   }
8397 
8398   return true;
8399 }
8400 
8401 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8402   unsigned NumElts = VT.getVectorNumElements();
8403   if (NumElts % 2 != 0)
8404     return false;
8405   WhichResult = (M[0] == 0 ? 0 : 1);
8406   for (unsigned i = 0; i < NumElts; i += 2) {
8407     if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
8408         (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
8409       return false;
8410   }
8411   return true;
8412 }
8413 
8414 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
8415 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
8416 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
8417 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8418   unsigned NumElts = VT.getVectorNumElements();
8419   if (NumElts % 2 != 0)
8420     return false;
8421   WhichResult = (M[0] == 0 ? 0 : 1);
8422   unsigned Idx = WhichResult * NumElts / 2;
8423   for (unsigned i = 0; i != NumElts; i += 2) {
8424     if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
8425         (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
8426       return false;
8427     Idx += 1;
8428   }
8429 
8430   return true;
8431 }
8432 
8433 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
8434 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
8435 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
8436 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8437   unsigned Half = VT.getVectorNumElements() / 2;
8438   WhichResult = (M[0] == 0 ? 0 : 1);
8439   for (unsigned j = 0; j != 2; ++j) {
8440     unsigned Idx = WhichResult;
8441     for (unsigned i = 0; i != Half; ++i) {
8442       int MIdx = M[i + j * Half];
8443       if (MIdx >= 0 && (unsigned)MIdx != Idx)
8444         return false;
8445       Idx += 2;
8446     }
8447   }
8448 
8449   return true;
8450 }
8451 
8452 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
8453 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
8454 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
8455 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
8456   unsigned NumElts = VT.getVectorNumElements();
8457   if (NumElts % 2 != 0)
8458     return false;
8459   WhichResult = (M[0] == 0 ? 0 : 1);
8460   for (unsigned i = 0; i < NumElts; i += 2) {
8461     if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
8462         (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
8463       return false;
8464   }
8465   return true;
8466 }
8467 
8468 static bool isINSMask(ArrayRef<int> M, int NumInputElements,
8469                       bool &DstIsLeft, int &Anomaly) {
8470   if (M.size() != static_cast<size_t>(NumInputElements))
8471     return false;
8472 
8473   int NumLHSMatch = 0, NumRHSMatch = 0;
8474   int LastLHSMismatch = -1, LastRHSMismatch = -1;
8475 
8476   for (int i = 0; i < NumInputElements; ++i) {
8477     if (M[i] == -1) {
8478       ++NumLHSMatch;
8479       ++NumRHSMatch;
8480       continue;
8481     }
8482 
8483     if (M[i] == i)
8484       ++NumLHSMatch;
8485     else
8486       LastLHSMismatch = i;
8487 
8488     if (M[i] == i + NumInputElements)
8489       ++NumRHSMatch;
8490     else
8491       LastRHSMismatch = i;
8492   }
8493 
8494   if (NumLHSMatch == NumInputElements - 1) {
8495     DstIsLeft = true;
8496     Anomaly = LastLHSMismatch;
8497     return true;
8498   } else if (NumRHSMatch == NumInputElements - 1) {
8499     DstIsLeft = false;
8500     Anomaly = LastRHSMismatch;
8501     return true;
8502   }
8503 
8504   return false;
8505 }
8506 
8507 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
8508   if (VT.getSizeInBits() != 128)
8509     return false;
8510 
8511   unsigned NumElts = VT.getVectorNumElements();
8512 
8513   for (int I = 0, E = NumElts / 2; I != E; I++) {
8514     if (Mask[I] != I)
8515       return false;
8516   }
8517 
8518   int Offset = NumElts / 2;
8519   for (int I = NumElts / 2, E = NumElts; I != E; I++) {
8520     if (Mask[I] != I + SplitLHS * Offset)
8521       return false;
8522   }
8523 
8524   return true;
8525 }
8526 
8527 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
8528   SDLoc DL(Op);
8529   EVT VT = Op.getValueType();
8530   SDValue V0 = Op.getOperand(0);
8531   SDValue V1 = Op.getOperand(1);
8532   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
8533 
8534   if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
8535       VT.getVectorElementType() != V1.getValueType().getVectorElementType())
8536     return SDValue();
8537 
8538   bool SplitV0 = V0.getValueSizeInBits() == 128;
8539 
8540   if (!isConcatMask(Mask, VT, SplitV0))
8541     return SDValue();
8542 
8543   EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
8544   if (SplitV0) {
8545     V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
8546                      DAG.getConstant(0, DL, MVT::i64));
8547   }
8548   if (V1.getValueSizeInBits() == 128) {
8549     V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
8550                      DAG.getConstant(0, DL, MVT::i64));
8551   }
8552   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
8553 }
8554 
8555 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8556 /// the specified operations to build the shuffle.
8557 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
8558                                       SDValue RHS, SelectionDAG &DAG,
8559                                       const SDLoc &dl) {
8560   unsigned OpNum = (PFEntry >> 26) & 0x0F;
8561   unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
8562   unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
8563 
8564   enum {
8565     OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8566     OP_VREV,
8567     OP_VDUP0,
8568     OP_VDUP1,
8569     OP_VDUP2,
8570     OP_VDUP3,
8571     OP_VEXT1,
8572     OP_VEXT2,
8573     OP_VEXT3,
8574     OP_VUZPL, // VUZP, left result
8575     OP_VUZPR, // VUZP, right result
8576     OP_VZIPL, // VZIP, left result
8577     OP_VZIPR, // VZIP, right result
8578     OP_VTRNL, // VTRN, left result
8579     OP_VTRNR  // VTRN, right result
8580   };
8581 
8582   if (OpNum == OP_COPY) {
8583     if (LHSID == (1 * 9 + 2) * 9 + 3)
8584       return LHS;
8585     assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
8586     return RHS;
8587   }
8588 
8589   SDValue OpLHS, OpRHS;
8590   OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8591   OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8592   EVT VT = OpLHS.getValueType();
8593 
8594   switch (OpNum) {
8595   default:
8596     llvm_unreachable("Unknown shuffle opcode!");
8597   case OP_VREV:
8598     // VREV divides the vector in half and swaps within the half.
8599     if (VT.getVectorElementType() == MVT::i32 ||
8600         VT.getVectorElementType() == MVT::f32)
8601       return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
8602     // vrev <4 x i16> -> REV32
8603     if (VT.getVectorElementType() == MVT::i16 ||
8604         VT.getVectorElementType() == MVT::f16 ||
8605         VT.getVectorElementType() == MVT::bf16)
8606       return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
8607     // vrev <4 x i8> -> REV16
8608     assert(VT.getVectorElementType() == MVT::i8);
8609     return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
8610   case OP_VDUP0:
8611   case OP_VDUP1:
8612   case OP_VDUP2:
8613   case OP_VDUP3: {
8614     EVT EltTy = VT.getVectorElementType();
8615     unsigned Opcode;
8616     if (EltTy == MVT::i8)
8617       Opcode = AArch64ISD::DUPLANE8;
8618     else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
8619       Opcode = AArch64ISD::DUPLANE16;
8620     else if (EltTy == MVT::i32 || EltTy == MVT::f32)
8621       Opcode = AArch64ISD::DUPLANE32;
8622     else if (EltTy == MVT::i64 || EltTy == MVT::f64)
8623       Opcode = AArch64ISD::DUPLANE64;
8624     else
8625       llvm_unreachable("Invalid vector element type?");
8626 
8627     if (VT.getSizeInBits() == 64)
8628       OpLHS = WidenVector(OpLHS, DAG);
8629     SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
8630     return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
8631   }
8632   case OP_VEXT1:
8633   case OP_VEXT2:
8634   case OP_VEXT3: {
8635     unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
8636     return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
8637                        DAG.getConstant(Imm, dl, MVT::i32));
8638   }
8639   case OP_VUZPL:
8640     return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
8641                        OpRHS);
8642   case OP_VUZPR:
8643     return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
8644                        OpRHS);
8645   case OP_VZIPL:
8646     return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
8647                        OpRHS);
8648   case OP_VZIPR:
8649     return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
8650                        OpRHS);
8651   case OP_VTRNL:
8652     return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
8653                        OpRHS);
8654   case OP_VTRNR:
8655     return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
8656                        OpRHS);
8657   }
8658 }
8659 
8660 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
8661                            SelectionDAG &DAG) {
8662   // Check to see if we can use the TBL instruction.
8663   SDValue V1 = Op.getOperand(0);
8664   SDValue V2 = Op.getOperand(1);
8665   SDLoc DL(Op);
8666 
8667   EVT EltVT = Op.getValueType().getVectorElementType();
8668   unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
8669 
8670   SmallVector<SDValue, 8> TBLMask;
8671   for (int Val : ShuffleMask) {
8672     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
8673       unsigned Offset = Byte + Val * BytesPerElt;
8674       TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
8675     }
8676   }
8677 
8678   MVT IndexVT = MVT::v8i8;
8679   unsigned IndexLen = 8;
8680   if (Op.getValueSizeInBits() == 128) {
8681     IndexVT = MVT::v16i8;
8682     IndexLen = 16;
8683   }
8684 
8685   SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
8686   SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
8687 
8688   SDValue Shuffle;
8689   if (V2.getNode()->isUndef()) {
8690     if (IndexLen == 8)
8691       V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
8692     Shuffle = DAG.getNode(
8693         ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
8694         DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
8695         DAG.getBuildVector(IndexVT, DL,
8696                            makeArrayRef(TBLMask.data(), IndexLen)));
8697   } else {
8698     if (IndexLen == 8) {
8699       V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
8700       Shuffle = DAG.getNode(
8701           ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
8702           DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
8703           DAG.getBuildVector(IndexVT, DL,
8704                              makeArrayRef(TBLMask.data(), IndexLen)));
8705     } else {
8706       // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
8707       // cannot currently represent the register constraints on the input
8708       // table registers.
8709       //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
8710       //                   DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
8711       //                   IndexLen));
8712       Shuffle = DAG.getNode(
8713           ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
8714           DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
8715           V2Cst, DAG.getBuildVector(IndexVT, DL,
8716                                     makeArrayRef(TBLMask.data(), IndexLen)));
8717     }
8718   }
8719   return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
8720 }
8721 
8722 static unsigned getDUPLANEOp(EVT EltType) {
8723   if (EltType == MVT::i8)
8724     return AArch64ISD::DUPLANE8;
8725   if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
8726     return AArch64ISD::DUPLANE16;
8727   if (EltType == MVT::i32 || EltType == MVT::f32)
8728     return AArch64ISD::DUPLANE32;
8729   if (EltType == MVT::i64 || EltType == MVT::f64)
8730     return AArch64ISD::DUPLANE64;
8731 
8732   llvm_unreachable("Invalid vector element type?");
8733 }
8734 
8735 static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
8736                             unsigned Opcode, SelectionDAG &DAG) {
8737   // Try to eliminate a bitcasted extract subvector before a DUPLANE.
8738   auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
8739     // Match: dup (bitcast (extract_subv X, C)), LaneC
8740     if (BitCast.getOpcode() != ISD::BITCAST ||
8741         BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
8742       return false;
8743 
8744     // The extract index must align in the destination type. That may not
8745     // happen if the bitcast is from narrow to wide type.
8746     SDValue Extract = BitCast.getOperand(0);
8747     unsigned ExtIdx = Extract.getConstantOperandVal(1);
8748     unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
8749     unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
8750     unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
8751     if (ExtIdxInBits % CastedEltBitWidth != 0)
8752       return false;
8753 
8754     // Update the lane value by offsetting with the scaled extract index.
8755     LaneC += ExtIdxInBits / CastedEltBitWidth;
8756 
8757     // Determine the casted vector type of the wide vector input.
8758     // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
8759     // Examples:
8760     // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
8761     // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
8762     unsigned SrcVecNumElts =
8763         Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
8764     CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
8765                               SrcVecNumElts);
8766     return true;
8767   };
8768   MVT CastVT;
8769   if (getScaledOffsetDup(V, Lane, CastVT)) {
8770     V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
8771   } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
8772     // The lane is incremented by the index of the extract.
8773     // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
8774     Lane += V.getConstantOperandVal(1);
8775     V = V.getOperand(0);
8776   } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
8777     // The lane is decremented if we are splatting from the 2nd operand.
8778     // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
8779     unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
8780     Lane -= Idx * VT.getVectorNumElements() / 2;
8781     V = WidenVector(V.getOperand(Idx), DAG);
8782   } else if (VT.getSizeInBits() == 64) {
8783     // Widen the operand to 128-bit register with undef.
8784     V = WidenVector(V, DAG);
8785   }
8786   return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
8787 }
8788 
8789 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
8790                                                    SelectionDAG &DAG) const {
8791   SDLoc dl(Op);
8792   EVT VT = Op.getValueType();
8793 
8794   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8795 
8796   // Convert shuffles that are directly supported on NEON to target-specific
8797   // DAG nodes, instead of keeping them as shuffles and matching them again
8798   // during code selection.  This is more efficient and avoids the possibility
8799   // of inconsistencies between legalization and selection.
8800   ArrayRef<int> ShuffleMask = SVN->getMask();
8801 
8802   SDValue V1 = Op.getOperand(0);
8803   SDValue V2 = Op.getOperand(1);
8804 
8805   if (SVN->isSplat()) {
8806     int Lane = SVN->getSplatIndex();
8807     // If this is undef splat, generate it via "just" vdup, if possible.
8808     if (Lane == -1)
8809       Lane = 0;
8810 
8811     if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
8812       return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
8813                          V1.getOperand(0));
8814     // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
8815     // constant. If so, we can just reference the lane's definition directly.
8816     if (V1.getOpcode() == ISD::BUILD_VECTOR &&
8817         !isa<ConstantSDNode>(V1.getOperand(Lane)))
8818       return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
8819 
8820     // Otherwise, duplicate from the lane of the input vector.
8821     unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
8822     return constructDup(V1, Lane, dl, VT, Opcode, DAG);
8823   }
8824 
8825   // Check if the mask matches a DUP for a wider element
8826   for (unsigned LaneSize : {64U, 32U, 16U}) {
8827     unsigned Lane = 0;
8828     if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
8829       unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
8830                                        : LaneSize == 32 ? AArch64ISD::DUPLANE32
8831                                                         : AArch64ISD::DUPLANE16;
8832       // Cast V1 to an integer vector with required lane size
8833       MVT NewEltTy = MVT::getIntegerVT(LaneSize);
8834       unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
8835       MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
8836       V1 = DAG.getBitcast(NewVecTy, V1);
8837       // Constuct the DUP instruction
8838       V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
8839       // Cast back to the original type
8840       return DAG.getBitcast(VT, V1);
8841     }
8842   }
8843 
8844   if (isREVMask(ShuffleMask, VT, 64))
8845     return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
8846   if (isREVMask(ShuffleMask, VT, 32))
8847     return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
8848   if (isREVMask(ShuffleMask, VT, 16))
8849     return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
8850 
8851   bool ReverseEXT = false;
8852   unsigned Imm;
8853   if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
8854     if (ReverseEXT)
8855       std::swap(V1, V2);
8856     Imm *= getExtFactor(V1);
8857     return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
8858                        DAG.getConstant(Imm, dl, MVT::i32));
8859   } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
8860     Imm *= getExtFactor(V1);
8861     return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
8862                        DAG.getConstant(Imm, dl, MVT::i32));
8863   }
8864 
8865   unsigned WhichResult;
8866   if (isZIPMask(ShuffleMask, VT, WhichResult)) {
8867     unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
8868     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
8869   }
8870   if (isUZPMask(ShuffleMask, VT, WhichResult)) {
8871     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
8872     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
8873   }
8874   if (isTRNMask(ShuffleMask, VT, WhichResult)) {
8875     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
8876     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
8877   }
8878 
8879   if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
8880     unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
8881     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
8882   }
8883   if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
8884     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
8885     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
8886   }
8887   if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
8888     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
8889     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
8890   }
8891 
8892   if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
8893     return Concat;
8894 
8895   bool DstIsLeft;
8896   int Anomaly;
8897   int NumInputElements = V1.getValueType().getVectorNumElements();
8898   if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
8899     SDValue DstVec = DstIsLeft ? V1 : V2;
8900     SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
8901 
8902     SDValue SrcVec = V1;
8903     int SrcLane = ShuffleMask[Anomaly];
8904     if (SrcLane >= NumInputElements) {
8905       SrcVec = V2;
8906       SrcLane -= VT.getVectorNumElements();
8907     }
8908     SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
8909 
8910     EVT ScalarVT = VT.getVectorElementType();
8911 
8912     if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
8913       ScalarVT = MVT::i32;
8914 
8915     return DAG.getNode(
8916         ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8917         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
8918         DstLaneV);
8919   }
8920 
8921   // If the shuffle is not directly supported and it has 4 elements, use
8922   // the PerfectShuffle-generated table to synthesize it from other shuffles.
8923   unsigned NumElts = VT.getVectorNumElements();
8924   if (NumElts == 4) {
8925     unsigned PFIndexes[4];
8926     for (unsigned i = 0; i != 4; ++i) {
8927       if (ShuffleMask[i] < 0)
8928         PFIndexes[i] = 8;
8929       else
8930         PFIndexes[i] = ShuffleMask[i];
8931     }
8932 
8933     // Compute the index in the perfect shuffle table.
8934     unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
8935                             PFIndexes[2] * 9 + PFIndexes[3];
8936     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8937     unsigned Cost = (PFEntry >> 30);
8938 
8939     if (Cost <= 4)
8940       return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8941   }
8942 
8943   return GenerateTBL(Op, ShuffleMask, DAG);
8944 }
8945 
8946 SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
8947                                                  SelectionDAG &DAG) const {
8948   SDLoc dl(Op);
8949   EVT VT = Op.getValueType();
8950   EVT ElemVT = VT.getScalarType();
8951   SDValue SplatVal = Op.getOperand(0);
8952 
8953   if (useSVEForFixedLengthVectorVT(VT))
8954     return LowerToScalableOp(Op, DAG);
8955 
8956   // Extend input splat value where needed to fit into a GPR (32b or 64b only)
8957   // FPRs don't have this restriction.
8958   switch (ElemVT.getSimpleVT().SimpleTy) {
8959   case MVT::i1: {
8960     // The only legal i1 vectors are SVE vectors, so we can use SVE-specific
8961     // lowering code.
8962     if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
8963       if (ConstVal->isOne())
8964         return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
8965       // TODO: Add special case for constant false
8966     }
8967     // The general case of i1.  There isn't any natural way to do this,
8968     // so we use some trickery with whilelo.
8969     SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
8970     SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal,
8971                            DAG.getValueType(MVT::i1));
8972     SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl,
8973                                        MVT::i64);
8974     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID,
8975                        DAG.getConstant(0, dl, MVT::i64), SplatVal);
8976   }
8977   case MVT::i8:
8978   case MVT::i16:
8979   case MVT::i32:
8980     SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
8981     break;
8982   case MVT::i64:
8983     SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
8984     break;
8985   case MVT::f16:
8986   case MVT::bf16:
8987   case MVT::f32:
8988   case MVT::f64:
8989     // Fine as is
8990     break;
8991   default:
8992     report_fatal_error("Unsupported SPLAT_VECTOR input operand type");
8993   }
8994 
8995   return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
8996 }
8997 
8998 SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
8999                                              SelectionDAG &DAG) const {
9000   SDLoc DL(Op);
9001 
9002   EVT VT = Op.getValueType();
9003   if (!isTypeLegal(VT) || !VT.isScalableVector())
9004     return SDValue();
9005 
9006   // Current lowering only supports the SVE-ACLE types.
9007   if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
9008     return SDValue();
9009 
9010   // The DUPQ operation is indepedent of element type so normalise to i64s.
9011   SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
9012   SDValue Idx128 = Op.getOperand(2);
9013 
9014   // DUPQ can be used when idx is in range.
9015   auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
9016   if (CIdx && (CIdx->getZExtValue() <= 3)) {
9017     SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
9018     SDNode *DUPQ =
9019         DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
9020     return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
9021   }
9022 
9023   // The ACLE says this must produce the same result as:
9024   //   svtbl(data, svadd_x(svptrue_b64(),
9025   //                       svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
9026   //                       index * 2))
9027   SDValue One = DAG.getConstant(1, DL, MVT::i64);
9028   SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
9029 
9030   // create the vector 0,1,0,1,...
9031   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
9032   SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR,
9033                            DL, MVT::nxv2i64, Zero, One);
9034   SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
9035 
9036   // create the vector idx64,idx64+1,idx64,idx64+1,...
9037   SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
9038   SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
9039   SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
9040 
9041   // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
9042   SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
9043   return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
9044 }
9045 
9046 
9047 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
9048                                APInt &UndefBits) {
9049   EVT VT = BVN->getValueType(0);
9050   APInt SplatBits, SplatUndef;
9051   unsigned SplatBitSize;
9052   bool HasAnyUndefs;
9053   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
9054     unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
9055 
9056     for (unsigned i = 0; i < NumSplats; ++i) {
9057       CnstBits <<= SplatBitSize;
9058       UndefBits <<= SplatBitSize;
9059       CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
9060       UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
9061     }
9062 
9063     return true;
9064   }
9065 
9066   return false;
9067 }
9068 
9069 // Try 64-bit splatted SIMD immediate.
9070 static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
9071                                  const APInt &Bits) {
9072   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9073     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9074     EVT VT = Op.getValueType();
9075     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
9076 
9077     if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
9078       Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
9079 
9080       SDLoc dl(Op);
9081       SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9082                                 DAG.getConstant(Value, dl, MVT::i32));
9083       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9084     }
9085   }
9086 
9087   return SDValue();
9088 }
9089 
9090 // Try 32-bit splatted SIMD immediate.
9091 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
9092                                   const APInt &Bits,
9093                                   const SDValue *LHS = nullptr) {
9094   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9095     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9096     EVT VT = Op.getValueType();
9097     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
9098     bool isAdvSIMDModImm = false;
9099     uint64_t Shift;
9100 
9101     if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
9102       Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
9103       Shift = 0;
9104     }
9105     else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
9106       Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
9107       Shift = 8;
9108     }
9109     else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
9110       Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
9111       Shift = 16;
9112     }
9113     else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
9114       Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
9115       Shift = 24;
9116     }
9117 
9118     if (isAdvSIMDModImm) {
9119       SDLoc dl(Op);
9120       SDValue Mov;
9121 
9122       if (LHS)
9123         Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
9124                           DAG.getConstant(Value, dl, MVT::i32),
9125                           DAG.getConstant(Shift, dl, MVT::i32));
9126       else
9127         Mov = DAG.getNode(NewOp, dl, MovTy,
9128                           DAG.getConstant(Value, dl, MVT::i32),
9129                           DAG.getConstant(Shift, dl, MVT::i32));
9130 
9131       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9132     }
9133   }
9134 
9135   return SDValue();
9136 }
9137 
9138 // Try 16-bit splatted SIMD immediate.
9139 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
9140                                   const APInt &Bits,
9141                                   const SDValue *LHS = nullptr) {
9142   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9143     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9144     EVT VT = Op.getValueType();
9145     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
9146     bool isAdvSIMDModImm = false;
9147     uint64_t Shift;
9148 
9149     if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
9150       Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
9151       Shift = 0;
9152     }
9153     else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
9154       Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
9155       Shift = 8;
9156     }
9157 
9158     if (isAdvSIMDModImm) {
9159       SDLoc dl(Op);
9160       SDValue Mov;
9161 
9162       if (LHS)
9163         Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
9164                           DAG.getConstant(Value, dl, MVT::i32),
9165                           DAG.getConstant(Shift, dl, MVT::i32));
9166       else
9167         Mov = DAG.getNode(NewOp, dl, MovTy,
9168                           DAG.getConstant(Value, dl, MVT::i32),
9169                           DAG.getConstant(Shift, dl, MVT::i32));
9170 
9171       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9172     }
9173   }
9174 
9175   return SDValue();
9176 }
9177 
9178 // Try 32-bit splatted SIMD immediate with shifted ones.
9179 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
9180                                     SelectionDAG &DAG, const APInt &Bits) {
9181   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9182     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9183     EVT VT = Op.getValueType();
9184     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
9185     bool isAdvSIMDModImm = false;
9186     uint64_t Shift;
9187 
9188     if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
9189       Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
9190       Shift = 264;
9191     }
9192     else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
9193       Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
9194       Shift = 272;
9195     }
9196 
9197     if (isAdvSIMDModImm) {
9198       SDLoc dl(Op);
9199       SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9200                                 DAG.getConstant(Value, dl, MVT::i32),
9201                                 DAG.getConstant(Shift, dl, MVT::i32));
9202       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9203     }
9204   }
9205 
9206   return SDValue();
9207 }
9208 
9209 // Try 8-bit splatted SIMD immediate.
9210 static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
9211                                  const APInt &Bits) {
9212   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9213     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9214     EVT VT = Op.getValueType();
9215     MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
9216 
9217     if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
9218       Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
9219 
9220       SDLoc dl(Op);
9221       SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9222                                 DAG.getConstant(Value, dl, MVT::i32));
9223       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9224     }
9225   }
9226 
9227   return SDValue();
9228 }
9229 
9230 // Try FP splatted SIMD immediate.
9231 static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
9232                                   const APInt &Bits) {
9233   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
9234     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
9235     EVT VT = Op.getValueType();
9236     bool isWide = (VT.getSizeInBits() == 128);
9237     MVT MovTy;
9238     bool isAdvSIMDModImm = false;
9239 
9240     if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
9241       Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
9242       MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
9243     }
9244     else if (isWide &&
9245              (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
9246       Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
9247       MovTy = MVT::v2f64;
9248     }
9249 
9250     if (isAdvSIMDModImm) {
9251       SDLoc dl(Op);
9252       SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
9253                                 DAG.getConstant(Value, dl, MVT::i32));
9254       return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
9255     }
9256   }
9257 
9258   return SDValue();
9259 }
9260 
9261 // Specialized code to quickly find if PotentialBVec is a BuildVector that
9262 // consists of only the same constant int value, returned in reference arg
9263 // ConstVal
9264 static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
9265                                      uint64_t &ConstVal) {
9266   BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
9267   if (!Bvec)
9268     return false;
9269   ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
9270   if (!FirstElt)
9271     return false;
9272   EVT VT = Bvec->getValueType(0);
9273   unsigned NumElts = VT.getVectorNumElements();
9274   for (unsigned i = 1; i < NumElts; ++i)
9275     if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
9276       return false;
9277   ConstVal = FirstElt->getZExtValue();
9278   return true;
9279 }
9280 
9281 static unsigned getIntrinsicID(const SDNode *N) {
9282   unsigned Opcode = N->getOpcode();
9283   switch (Opcode) {
9284   default:
9285     return Intrinsic::not_intrinsic;
9286   case ISD::INTRINSIC_WO_CHAIN: {
9287     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
9288     if (IID < Intrinsic::num_intrinsics)
9289       return IID;
9290     return Intrinsic::not_intrinsic;
9291   }
9292   }
9293 }
9294 
9295 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
9296 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
9297 // BUILD_VECTORs with constant element C1, C2 is a constant, and:
9298 //   - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
9299 //   - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
9300 // The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
9301 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
9302   EVT VT = N->getValueType(0);
9303 
9304   if (!VT.isVector())
9305     return SDValue();
9306 
9307   SDLoc DL(N);
9308 
9309   SDValue And;
9310   SDValue Shift;
9311 
9312   SDValue FirstOp = N->getOperand(0);
9313   unsigned FirstOpc = FirstOp.getOpcode();
9314   SDValue SecondOp = N->getOperand(1);
9315   unsigned SecondOpc = SecondOp.getOpcode();
9316 
9317   // Is one of the operands an AND or a BICi? The AND may have been optimised to
9318   // a BICi in order to use an immediate instead of a register.
9319   // Is the other operand an shl or lshr? This will have been turned into:
9320   // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
9321   if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
9322       (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
9323     And = FirstOp;
9324     Shift = SecondOp;
9325 
9326   } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
9327              (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
9328     And = SecondOp;
9329     Shift = FirstOp;
9330   } else
9331     return SDValue();
9332 
9333   bool IsAnd = And.getOpcode() == ISD::AND;
9334   bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
9335 
9336   // Is the shift amount constant?
9337   ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
9338   if (!C2node)
9339     return SDValue();
9340 
9341   uint64_t C1;
9342   if (IsAnd) {
9343     // Is the and mask vector all constant?
9344     if (!isAllConstantBuildVector(And.getOperand(1), C1))
9345       return SDValue();
9346   } else {
9347     // Reconstruct the corresponding AND immediate from the two BICi immediates.
9348     ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
9349     ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
9350     assert(C1nodeImm && C1nodeShift);
9351     C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
9352   }
9353 
9354   // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
9355   // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
9356   // how much one can shift elements of a particular size?
9357   uint64_t C2 = C2node->getZExtValue();
9358   unsigned ElemSizeInBits = VT.getScalarSizeInBits();
9359   if (C2 > ElemSizeInBits)
9360     return SDValue();
9361 
9362   APInt C1AsAPInt(ElemSizeInBits, C1);
9363   APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
9364                                   : APInt::getLowBitsSet(ElemSizeInBits, C2);
9365   if (C1AsAPInt != RequiredC1)
9366     return SDValue();
9367 
9368   SDValue X = And.getOperand(0);
9369   SDValue Y = Shift.getOperand(0);
9370 
9371   unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
9372   SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
9373 
9374   LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
9375   LLVM_DEBUG(N->dump(&DAG));
9376   LLVM_DEBUG(dbgs() << "into: \n");
9377   LLVM_DEBUG(ResultSLI->dump(&DAG));
9378 
9379   ++NumShiftInserts;
9380   return ResultSLI;
9381 }
9382 
9383 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
9384                                              SelectionDAG &DAG) const {
9385   if (useSVEForFixedLengthVectorVT(Op.getValueType()))
9386     return LowerToScalableOp(Op, DAG);
9387 
9388   // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
9389   if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
9390     return Res;
9391 
9392   EVT VT = Op.getValueType();
9393 
9394   SDValue LHS = Op.getOperand(0);
9395   BuildVectorSDNode *BVN =
9396       dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
9397   if (!BVN) {
9398     // OR commutes, so try swapping the operands.
9399     LHS = Op.getOperand(1);
9400     BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
9401   }
9402   if (!BVN)
9403     return Op;
9404 
9405   APInt DefBits(VT.getSizeInBits(), 0);
9406   APInt UndefBits(VT.getSizeInBits(), 0);
9407   if (resolveBuildVector(BVN, DefBits, UndefBits)) {
9408     SDValue NewOp;
9409 
9410     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
9411                                     DefBits, &LHS)) ||
9412         (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
9413                                     DefBits, &LHS)))
9414       return NewOp;
9415 
9416     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
9417                                     UndefBits, &LHS)) ||
9418         (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
9419                                     UndefBits, &LHS)))
9420       return NewOp;
9421   }
9422 
9423   // We can always fall back to a non-immediate OR.
9424   return Op;
9425 }
9426 
9427 // Normalize the operands of BUILD_VECTOR. The value of constant operands will
9428 // be truncated to fit element width.
9429 static SDValue NormalizeBuildVector(SDValue Op,
9430                                     SelectionDAG &DAG) {
9431   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
9432   SDLoc dl(Op);
9433   EVT VT = Op.getValueType();
9434   EVT EltTy= VT.getVectorElementType();
9435 
9436   if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
9437     return Op;
9438 
9439   SmallVector<SDValue, 16> Ops;
9440   for (SDValue Lane : Op->ops()) {
9441     // For integer vectors, type legalization would have promoted the
9442     // operands already. Otherwise, if Op is a floating-point splat
9443     // (with operands cast to integers), then the only possibilities
9444     // are constants and UNDEFs.
9445     if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
9446       APInt LowBits(EltTy.getSizeInBits(),
9447                     CstLane->getZExtValue());
9448       Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
9449     } else if (Lane.getNode()->isUndef()) {
9450       Lane = DAG.getUNDEF(MVT::i32);
9451     } else {
9452       assert(Lane.getValueType() == MVT::i32 &&
9453              "Unexpected BUILD_VECTOR operand type");
9454     }
9455     Ops.push_back(Lane);
9456   }
9457   return DAG.getBuildVector(VT, dl, Ops);
9458 }
9459 
9460 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
9461   EVT VT = Op.getValueType();
9462 
9463   APInt DefBits(VT.getSizeInBits(), 0);
9464   APInt UndefBits(VT.getSizeInBits(), 0);
9465   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
9466   if (resolveBuildVector(BVN, DefBits, UndefBits)) {
9467     SDValue NewOp;
9468     if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
9469         (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
9470         (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
9471         (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
9472         (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
9473         (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
9474       return NewOp;
9475 
9476     DefBits = ~DefBits;
9477     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
9478         (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
9479         (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
9480       return NewOp;
9481 
9482     DefBits = UndefBits;
9483     if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
9484         (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
9485         (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
9486         (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
9487         (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
9488         (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
9489       return NewOp;
9490 
9491     DefBits = ~UndefBits;
9492     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
9493         (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
9494         (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
9495       return NewOp;
9496   }
9497 
9498   return SDValue();
9499 }
9500 
9501 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
9502                                                  SelectionDAG &DAG) const {
9503   EVT VT = Op.getValueType();
9504 
9505   // Try to build a simple constant vector.
9506   Op = NormalizeBuildVector(Op, DAG);
9507   if (VT.isInteger()) {
9508     // Certain vector constants, used to express things like logical NOT and
9509     // arithmetic NEG, are passed through unmodified.  This allows special
9510     // patterns for these operations to match, which will lower these constants
9511     // to whatever is proven necessary.
9512     BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
9513     if (BVN->isConstant())
9514       if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
9515         unsigned BitSize = VT.getVectorElementType().getSizeInBits();
9516         APInt Val(BitSize,
9517                   Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
9518         if (Val.isNullValue() || Val.isAllOnesValue())
9519           return Op;
9520       }
9521   }
9522 
9523   if (SDValue V = ConstantBuildVector(Op, DAG))
9524     return V;
9525 
9526   // Scan through the operands to find some interesting properties we can
9527   // exploit:
9528   //   1) If only one value is used, we can use a DUP, or
9529   //   2) if only the low element is not undef, we can just insert that, or
9530   //   3) if only one constant value is used (w/ some non-constant lanes),
9531   //      we can splat the constant value into the whole vector then fill
9532   //      in the non-constant lanes.
9533   //   4) FIXME: If different constant values are used, but we can intelligently
9534   //             select the values we'll be overwriting for the non-constant
9535   //             lanes such that we can directly materialize the vector
9536   //             some other way (MOVI, e.g.), we can be sneaky.
9537   //   5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
9538   SDLoc dl(Op);
9539   unsigned NumElts = VT.getVectorNumElements();
9540   bool isOnlyLowElement = true;
9541   bool usesOnlyOneValue = true;
9542   bool usesOnlyOneConstantValue = true;
9543   bool isConstant = true;
9544   bool AllLanesExtractElt = true;
9545   unsigned NumConstantLanes = 0;
9546   unsigned NumDifferentLanes = 0;
9547   unsigned NumUndefLanes = 0;
9548   SDValue Value;
9549   SDValue ConstantValue;
9550   for (unsigned i = 0; i < NumElts; ++i) {
9551     SDValue V = Op.getOperand(i);
9552     if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9553       AllLanesExtractElt = false;
9554     if (V.isUndef()) {
9555       ++NumUndefLanes;
9556       continue;
9557     }
9558     if (i > 0)
9559       isOnlyLowElement = false;
9560     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
9561       isConstant = false;
9562 
9563     if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
9564       ++NumConstantLanes;
9565       if (!ConstantValue.getNode())
9566         ConstantValue = V;
9567       else if (ConstantValue != V)
9568         usesOnlyOneConstantValue = false;
9569     }
9570 
9571     if (!Value.getNode())
9572       Value = V;
9573     else if (V != Value) {
9574       usesOnlyOneValue = false;
9575       ++NumDifferentLanes;
9576     }
9577   }
9578 
9579   if (!Value.getNode()) {
9580     LLVM_DEBUG(
9581         dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
9582     return DAG.getUNDEF(VT);
9583   }
9584 
9585   // Convert BUILD_VECTOR where all elements but the lowest are undef into
9586   // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
9587   // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
9588   if (isOnlyLowElement && !(NumElts == 1 && isa<ConstantSDNode>(Value))) {
9589     LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
9590                          "SCALAR_TO_VECTOR node\n");
9591     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
9592   }
9593 
9594   if (AllLanesExtractElt) {
9595     SDNode *Vector = nullptr;
9596     bool Even = false;
9597     bool Odd = false;
9598     // Check whether the extract elements match the Even pattern <0,2,4,...> or
9599     // the Odd pattern <1,3,5,...>.
9600     for (unsigned i = 0; i < NumElts; ++i) {
9601       SDValue V = Op.getOperand(i);
9602       const SDNode *N = V.getNode();
9603       if (!isa<ConstantSDNode>(N->getOperand(1)))
9604         break;
9605       SDValue N0 = N->getOperand(0);
9606 
9607       // All elements are extracted from the same vector.
9608       if (!Vector) {
9609         Vector = N0.getNode();
9610         // Check that the type of EXTRACT_VECTOR_ELT matches the type of
9611         // BUILD_VECTOR.
9612         if (VT.getVectorElementType() !=
9613             N0.getValueType().getVectorElementType())
9614           break;
9615       } else if (Vector != N0.getNode()) {
9616         Odd = false;
9617         Even = false;
9618         break;
9619       }
9620 
9621       // Extracted values are either at Even indices <0,2,4,...> or at Odd
9622       // indices <1,3,5,...>.
9623       uint64_t Val = N->getConstantOperandVal(1);
9624       if (Val == 2 * i) {
9625         Even = true;
9626         continue;
9627       }
9628       if (Val - 1 == 2 * i) {
9629         Odd = true;
9630         continue;
9631       }
9632 
9633       // Something does not match: abort.
9634       Odd = false;
9635       Even = false;
9636       break;
9637     }
9638     if (Even || Odd) {
9639       SDValue LHS =
9640           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
9641                       DAG.getConstant(0, dl, MVT::i64));
9642       SDValue RHS =
9643           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
9644                       DAG.getConstant(NumElts, dl, MVT::i64));
9645 
9646       if (Even && !Odd)
9647         return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
9648                            RHS);
9649       if (Odd && !Even)
9650         return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
9651                            RHS);
9652     }
9653   }
9654 
9655   // Use DUP for non-constant splats. For f32 constant splats, reduce to
9656   // i32 and try again.
9657   if (usesOnlyOneValue) {
9658     if (!isConstant) {
9659       if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9660           Value.getValueType() != VT) {
9661         LLVM_DEBUG(
9662             dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
9663         return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
9664       }
9665 
9666       // This is actually a DUPLANExx operation, which keeps everything vectory.
9667 
9668       SDValue Lane = Value.getOperand(1);
9669       Value = Value.getOperand(0);
9670       if (Value.getValueSizeInBits() == 64) {
9671         LLVM_DEBUG(
9672             dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
9673                       "widening it\n");
9674         Value = WidenVector(Value, DAG);
9675       }
9676 
9677       unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
9678       return DAG.getNode(Opcode, dl, VT, Value, Lane);
9679     }
9680 
9681     if (VT.getVectorElementType().isFloatingPoint()) {
9682       SmallVector<SDValue, 8> Ops;
9683       EVT EltTy = VT.getVectorElementType();
9684       assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
9685                EltTy == MVT::f64) && "Unsupported floating-point vector type");
9686       LLVM_DEBUG(
9687           dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
9688                     "BITCASTS, and try again\n");
9689       MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
9690       for (unsigned i = 0; i < NumElts; ++i)
9691         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
9692       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
9693       SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
9694       LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
9695                  Val.dump(););
9696       Val = LowerBUILD_VECTOR(Val, DAG);
9697       if (Val.getNode())
9698         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
9699     }
9700   }
9701 
9702   // If we need to insert a small number of different non-constant elements and
9703   // the vector width is sufficiently large, prefer using DUP with the common
9704   // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
9705   // skip the constant lane handling below.
9706   bool PreferDUPAndInsert =
9707       !isConstant && NumDifferentLanes >= 1 &&
9708       NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
9709       NumDifferentLanes >= NumConstantLanes;
9710 
9711   // If there was only one constant value used and for more than one lane,
9712   // start by splatting that value, then replace the non-constant lanes. This
9713   // is better than the default, which will perform a separate initialization
9714   // for each lane.
9715   if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
9716     // Firstly, try to materialize the splat constant.
9717     SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
9718             Val = ConstantBuildVector(Vec, DAG);
9719     if (!Val) {
9720       // Otherwise, materialize the constant and splat it.
9721       Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
9722       DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
9723     }
9724 
9725     // Now insert the non-constant lanes.
9726     for (unsigned i = 0; i < NumElts; ++i) {
9727       SDValue V = Op.getOperand(i);
9728       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
9729       if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V))
9730         // Note that type legalization likely mucked about with the VT of the
9731         // source operand, so we may have to convert it here before inserting.
9732         Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
9733     }
9734     return Val;
9735   }
9736 
9737   // This will generate a load from the constant pool.
9738   if (isConstant) {
9739     LLVM_DEBUG(
9740         dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
9741                   "expansion\n");
9742     return SDValue();
9743   }
9744 
9745   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
9746   if (NumElts >= 4) {
9747     if (SDValue shuffle = ReconstructShuffle(Op, DAG))
9748       return shuffle;
9749   }
9750 
9751   if (PreferDUPAndInsert) {
9752     // First, build a constant vector with the common element.
9753     SmallVector<SDValue, 8> Ops;
9754     for (unsigned I = 0; I < NumElts; ++I)
9755       Ops.push_back(Value);
9756     SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
9757     // Next, insert the elements that do not match the common value.
9758     for (unsigned I = 0; I < NumElts; ++I)
9759       if (Op.getOperand(I) != Value)
9760         NewVector =
9761             DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
9762                         Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
9763 
9764     return NewVector;
9765   }
9766 
9767   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
9768   // know the default expansion would otherwise fall back on something even
9769   // worse. For a vector with one or two non-undef values, that's
9770   // scalar_to_vector for the elements followed by a shuffle (provided the
9771   // shuffle is valid for the target) and materialization element by element
9772   // on the stack followed by a load for everything else.
9773   if (!isConstant && !usesOnlyOneValue) {
9774     LLVM_DEBUG(
9775         dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
9776                   "of INSERT_VECTOR_ELT\n");
9777 
9778     SDValue Vec = DAG.getUNDEF(VT);
9779     SDValue Op0 = Op.getOperand(0);
9780     unsigned i = 0;
9781 
9782     // Use SCALAR_TO_VECTOR for lane zero to
9783     // a) Avoid a RMW dependency on the full vector register, and
9784     // b) Allow the register coalescer to fold away the copy if the
9785     //    value is already in an S or D register, and we're forced to emit an
9786     //    INSERT_SUBREG that we can't fold anywhere.
9787     //
9788     // We also allow types like i8 and i16 which are illegal scalar but legal
9789     // vector element types. After type-legalization the inserted value is
9790     // extended (i32) and it is safe to cast them to the vector type by ignoring
9791     // the upper bits of the lowest lane (e.g. v8i8, v4i16).
9792     if (!Op0.isUndef()) {
9793       LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
9794       Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
9795       ++i;
9796     }
9797     LLVM_DEBUG(if (i < NumElts) dbgs()
9798                    << "Creating nodes for the other vector elements:\n";);
9799     for (; i < NumElts; ++i) {
9800       SDValue V = Op.getOperand(i);
9801       if (V.isUndef())
9802         continue;
9803       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
9804       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
9805     }
9806     return Vec;
9807   }
9808 
9809   LLVM_DEBUG(
9810       dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
9811                 "better alternative\n");
9812   return SDValue();
9813 }
9814 
9815 SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
9816                                                    SelectionDAG &DAG) const {
9817   assert(Op.getValueType().isScalableVector() &&
9818          isTypeLegal(Op.getValueType()) &&
9819          "Expected legal scalable vector type!");
9820 
9821   if (isTypeLegal(Op.getOperand(0).getValueType()) && Op.getNumOperands() == 2)
9822     return Op;
9823 
9824   return SDValue();
9825 }
9826 
9827 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
9828                                                       SelectionDAG &DAG) const {
9829   assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
9830 
9831   // Check for non-constant or out of range lane.
9832   EVT VT = Op.getOperand(0).getValueType();
9833   ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
9834   if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
9835     return SDValue();
9836 
9837 
9838   // Insertion/extraction are legal for V128 types.
9839   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
9840       VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
9841       VT == MVT::v8f16 || VT == MVT::v8bf16)
9842     return Op;
9843 
9844   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
9845       VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
9846       VT != MVT::v4bf16)
9847     return SDValue();
9848 
9849   // For V64 types, we perform insertion by expanding the value
9850   // to a V128 type and perform the insertion on that.
9851   SDLoc DL(Op);
9852   SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
9853   EVT WideTy = WideVec.getValueType();
9854 
9855   SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
9856                              Op.getOperand(1), Op.getOperand(2));
9857   // Re-narrow the resultant vector.
9858   return NarrowVector(Node, DAG);
9859 }
9860 
9861 SDValue
9862 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
9863                                                SelectionDAG &DAG) const {
9864   assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
9865 
9866   // Check for non-constant or out of range lane.
9867   EVT VT = Op.getOperand(0).getValueType();
9868   ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
9869   if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
9870     return SDValue();
9871 
9872 
9873   // Insertion/extraction are legal for V128 types.
9874   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
9875       VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
9876       VT == MVT::v8f16 || VT == MVT::v8bf16)
9877     return Op;
9878 
9879   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
9880       VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
9881       VT != MVT::v4bf16)
9882     return SDValue();
9883 
9884   // For V64 types, we perform extraction by expanding the value
9885   // to a V128 type and perform the extraction on that.
9886   SDLoc DL(Op);
9887   SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
9888   EVT WideTy = WideVec.getValueType();
9889 
9890   EVT ExtrTy = WideTy.getVectorElementType();
9891   if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
9892     ExtrTy = MVT::i32;
9893 
9894   // For extractions, we just return the result directly.
9895   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
9896                      Op.getOperand(1));
9897 }
9898 
9899 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
9900                                                       SelectionDAG &DAG) const {
9901   assert(Op.getValueType().isFixedLengthVector() &&
9902          "Only cases that extract a fixed length vector are supported!");
9903 
9904   EVT InVT = Op.getOperand(0).getValueType();
9905   unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
9906   unsigned Size = Op.getValueSizeInBits();
9907 
9908   if (InVT.isScalableVector()) {
9909     // This will be matched by custom code during ISelDAGToDAG.
9910     if (Idx == 0 && isPackedVectorType(InVT, DAG))
9911       return Op;
9912 
9913     return SDValue();
9914   }
9915 
9916   // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
9917   if (Idx == 0 && InVT.getSizeInBits() <= 128)
9918     return Op;
9919 
9920   // If this is extracting the upper 64-bits of a 128-bit vector, we match
9921   // that directly.
9922   if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
9923       InVT.getSizeInBits() == 128)
9924     return Op;
9925 
9926   return SDValue();
9927 }
9928 
9929 SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
9930                                                      SelectionDAG &DAG) const {
9931   assert(Op.getValueType().isScalableVector() &&
9932          "Only expect to lower inserts into scalable vectors!");
9933 
9934   EVT InVT = Op.getOperand(1).getValueType();
9935   unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
9936 
9937   if (InVT.isScalableVector()) {
9938     SDLoc DL(Op);
9939     EVT VT = Op.getValueType();
9940 
9941     if (!isTypeLegal(VT) || !VT.isInteger())
9942       return SDValue();
9943 
9944     SDValue Vec0 = Op.getOperand(0);
9945     SDValue Vec1 = Op.getOperand(1);
9946 
9947     // Ensure the subvector is half the size of the main vector.
9948     if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
9949       return SDValue();
9950 
9951     // Extend elements of smaller vector...
9952     EVT WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext()));
9953     SDValue ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
9954 
9955     if (Idx == 0) {
9956       SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
9957       return DAG.getNode(AArch64ISD::UZP1, DL, VT, ExtVec, HiVec0);
9958     } else if (Idx == InVT.getVectorMinNumElements()) {
9959       SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
9960       return DAG.getNode(AArch64ISD::UZP1, DL, VT, LoVec0, ExtVec);
9961     }
9962 
9963     return SDValue();
9964   }
9965 
9966   // This will be matched by custom code during ISelDAGToDAG.
9967   if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
9968     return Op;
9969 
9970   return SDValue();
9971 }
9972 
9973 SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
9974   EVT VT = Op.getValueType();
9975 
9976   if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
9977     return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
9978 
9979   assert(VT.isScalableVector() && "Expected a scalable vector.");
9980 
9981   bool Signed = Op.getOpcode() == ISD::SDIV;
9982   unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
9983 
9984   if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
9985     return LowerToPredicatedOp(Op, DAG, PredOpcode);
9986 
9987   // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
9988   // operations, and truncate the result.
9989   EVT WidenedVT;
9990   if (VT == MVT::nxv16i8)
9991     WidenedVT = MVT::nxv8i16;
9992   else if (VT == MVT::nxv8i16)
9993     WidenedVT = MVT::nxv4i32;
9994   else
9995     llvm_unreachable("Unexpected Custom DIV operation");
9996 
9997   SDLoc dl(Op);
9998   unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
9999   unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
10000   SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
10001   SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
10002   SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
10003   SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
10004   SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
10005   SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
10006   return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
10007 }
10008 
10009 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
10010   // Currently no fixed length shuffles that require SVE are legal.
10011   if (useSVEForFixedLengthVectorVT(VT))
10012     return false;
10013 
10014   if (VT.getVectorNumElements() == 4 &&
10015       (VT.is128BitVector() || VT.is64BitVector())) {
10016     unsigned PFIndexes[4];
10017     for (unsigned i = 0; i != 4; ++i) {
10018       if (M[i] < 0)
10019         PFIndexes[i] = 8;
10020       else
10021         PFIndexes[i] = M[i];
10022     }
10023 
10024     // Compute the index in the perfect shuffle table.
10025     unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
10026                             PFIndexes[2] * 9 + PFIndexes[3];
10027     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
10028     unsigned Cost = (PFEntry >> 30);
10029 
10030     if (Cost <= 4)
10031       return true;
10032   }
10033 
10034   bool DummyBool;
10035   int DummyInt;
10036   unsigned DummyUnsigned;
10037 
10038   return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
10039           isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
10040           isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
10041           // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
10042           isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
10043           isZIPMask(M, VT, DummyUnsigned) ||
10044           isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
10045           isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
10046           isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
10047           isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
10048           isConcatMask(M, VT, VT.getSizeInBits() == 128));
10049 }
10050 
10051 /// getVShiftImm - Check if this is a valid build_vector for the immediate
10052 /// operand of a vector shift operation, where all the elements of the
10053 /// build_vector must have the same constant integer value.
10054 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
10055   // Ignore bit_converts.
10056   while (Op.getOpcode() == ISD::BITCAST)
10057     Op = Op.getOperand(0);
10058   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
10059   APInt SplatBits, SplatUndef;
10060   unsigned SplatBitSize;
10061   bool HasAnyUndefs;
10062   if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
10063                                     HasAnyUndefs, ElementBits) ||
10064       SplatBitSize > ElementBits)
10065     return false;
10066   Cnt = SplatBits.getSExtValue();
10067   return true;
10068 }
10069 
10070 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
10071 /// operand of a vector shift left operation.  That value must be in the range:
10072 ///   0 <= Value < ElementBits for a left shift; or
10073 ///   0 <= Value <= ElementBits for a long left shift.
10074 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
10075   assert(VT.isVector() && "vector shift count is not a vector type");
10076   int64_t ElementBits = VT.getScalarSizeInBits();
10077   if (!getVShiftImm(Op, ElementBits, Cnt))
10078     return false;
10079   return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
10080 }
10081 
10082 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
10083 /// operand of a vector shift right operation. The value must be in the range:
10084 ///   1 <= Value <= ElementBits for a right shift; or
10085 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
10086   assert(VT.isVector() && "vector shift count is not a vector type");
10087   int64_t ElementBits = VT.getScalarSizeInBits();
10088   if (!getVShiftImm(Op, ElementBits, Cnt))
10089     return false;
10090   return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
10091 }
10092 
10093 SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
10094                                              SelectionDAG &DAG) const {
10095   EVT VT = Op.getValueType();
10096 
10097   if (VT.getScalarType() == MVT::i1) {
10098     // Lower i1 truncate to `(x & 1) != 0`.
10099     SDLoc dl(Op);
10100     EVT OpVT = Op.getOperand(0).getValueType();
10101     SDValue Zero = DAG.getConstant(0, dl, OpVT);
10102     SDValue One = DAG.getConstant(1, dl, OpVT);
10103     SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
10104     return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
10105   }
10106 
10107   if (!VT.isVector() || VT.isScalableVector())
10108     return SDValue();
10109 
10110   if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
10111     return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
10112 
10113   return SDValue();
10114 }
10115 
10116 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
10117                                                       SelectionDAG &DAG) const {
10118   EVT VT = Op.getValueType();
10119   SDLoc DL(Op);
10120   int64_t Cnt;
10121 
10122   if (!Op.getOperand(1).getValueType().isVector())
10123     return Op;
10124   unsigned EltSize = VT.getScalarSizeInBits();
10125 
10126   switch (Op.getOpcode()) {
10127   default:
10128     llvm_unreachable("unexpected shift opcode");
10129 
10130   case ISD::SHL:
10131     if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
10132       return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
10133 
10134     if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
10135       return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
10136                          DAG.getConstant(Cnt, DL, MVT::i32));
10137     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
10138                        DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
10139                                        MVT::i32),
10140                        Op.getOperand(0), Op.getOperand(1));
10141   case ISD::SRA:
10142   case ISD::SRL:
10143     if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) {
10144       unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
10145                                                 : AArch64ISD::SRL_PRED;
10146       return LowerToPredicatedOp(Op, DAG, Opc);
10147     }
10148 
10149     // Right shift immediate
10150     if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
10151       unsigned Opc =
10152           (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
10153       return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
10154                          DAG.getConstant(Cnt, DL, MVT::i32));
10155     }
10156 
10157     // Right shift register.  Note, there is not a shift right register
10158     // instruction, but the shift left register instruction takes a signed
10159     // value, where negative numbers specify a right shift.
10160     unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
10161                                                 : Intrinsic::aarch64_neon_ushl;
10162     // negate the shift amount
10163     SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
10164     SDValue NegShiftLeft =
10165         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
10166                     DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
10167                     NegShift);
10168     return NegShiftLeft;
10169   }
10170 
10171   return SDValue();
10172 }
10173 
/// Emit the AArch64 vector compare node(s) that implement condition CC on
/// LHS and RHS, producing a value of type VT. VT must have the same total
/// size as the type of LHS.
///
/// When RHS is a build_vector that resolveBuildVector reports as constant
/// with all bits zero, the compare-against-zero node variants (e.g. CMEQz,
/// FCMGEz) are used where one exists. Conditions without a direct node are
/// expressed by swapping the operands of another compare, and NE is emitted
/// as NOT(EQ).
///
/// \param NoNans  only consulted for floating-point LT, which is handled
///                (via the MI path) only when this is true.
/// \returns the comparison node, or an empty SDValue when CC is not handled
///          for the element kind of LHS.
10174 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
10175                                     AArch64CC::CondCode CC, bool NoNans, EVT VT,
10176                                     const SDLoc &dl, SelectionDAG &DAG) {
10177   EVT SrcVT = LHS.getValueType();
10178   assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
10179          "function only supposed to emit natural comparisons");
10180 
  // Detect a constant all-zero RHS so the *z node forms can be used.
10181   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
10182   APInt CnstBits(VT.getSizeInBits(), 0);
10183   APInt UndefBits(VT.getSizeInBits(), 0);
10184   bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
10185   bool IsZero = IsCnst && (CnstBits == 0);
10186 
  // Floating-point element types. Every case in this switch returns.
10187   if (SrcVT.getVectorElementType().isFloatingPoint()) {
10188     switch (CC) {
10189     default:
10190       return SDValue();
10191     case AArch64CC::NE: {
10192       SDValue Fcmeq;
10193       if (IsZero)
10194         Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
10195       else
10196         Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
10197       return DAG.getNOT(dl, Fcmeq, VT);
10198     }
10199     case AArch64CC::EQ:
10200       if (IsZero)
10201         return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
10202       return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
10203     case AArch64CC::GE:
10204       if (IsZero)
10205         return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
10206       return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
10207     case AArch64CC::GT:
10208       if (IsZero)
10209         return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
10210       return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
10211     case AArch64CC::LS:
10212       if (IsZero)
10213         return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
      // No direct node is used here: emit FCMGE with the operands swapped.
10214       return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
10215     case AArch64CC::LT:
10216       if (!NoNans)
10217         return SDValue();
10218       // If we ignore NaNs then we can use the MI implementation.
10219       LLVM_FALLTHROUGH;
10220     case AArch64CC::MI:
10221       if (IsZero)
10222         return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
      // Emit FCMGT with the operands swapped.
10223       return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
10224     }
10225   }
10226 
  // Integer element types.
10227   switch (CC) {
10228   default:
10229     return SDValue();
10230   case AArch64CC::NE: {
10231     SDValue Cmeq;
10232     if (IsZero)
10233       Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
10234     else
10235       Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
10236     return DAG.getNOT(dl, Cmeq, VT);
10237   }
10238   case AArch64CC::EQ:
10239     if (IsZero)
10240       return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
10241     return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
10242   case AArch64CC::GE:
10243     if (IsZero)
10244       return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
10245     return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
10246   case AArch64CC::GT:
10247     if (IsZero)
10248       return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
10249     return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
10250   case AArch64CC::LE:
10251     if (IsZero)
10252       return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
10253     return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
  // LS and LO reuse CMHS / CMHI with the operands swapped.
10254   case AArch64CC::LS:
10255     return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
10256   case AArch64CC::LO:
10257     return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
10258   case AArch64CC::LT:
10259     if (IsZero)
10260       return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
10261     return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
10262   case AArch64CC::HI:
10263     return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
10264   case AArch64CC::HS:
10265     return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
10266   }
10267 }
10268 
/// Lower a vector SETCC node.
///  - Scalable result: floating-point compares are returned unchanged, the
///    rest become SETCC_MERGE_ZERO via LowerToPredicatedOp.
///  - Fixed-length operands handled with SVE go to
///    LowerFixedLengthVectorSetccToSVE.
///  - Otherwise the compare is built from EmitVectorComparison and then
///    sign-extended or truncated to the node's result type.
/// Returns an empty SDValue when no lowering is produced here.
10269 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
10270                                            SelectionDAG &DAG) const {
10271   if (Op.getValueType().isScalableVector()) {
10272     if (Op.getOperand(0).getValueType().isFloatingPoint())
10273       return Op;
10274     return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
10275   }
10276 
10277   if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
10278     return LowerFixedLengthVectorSetccToSVE(Op, DAG);
10279 
10280   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
10281   SDValue LHS = Op.getOperand(0);
10282   SDValue RHS = Op.getOperand(1);
  // The comparison is emitted in the integer vector type that has the same
  // element count and element width as the operands.
10283   EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
10284   SDLoc dl(Op);
10285 
  // Integer compares map onto a single AArch64 condition code.
10286   if (LHS.getValueType().getVectorElementType().isInteger()) {
10287     assert(LHS.getValueType() == RHS.getValueType());
10288     AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10289     SDValue Cmp =
10290         EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
10291     return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
10292   }
10293 
10294   const bool FullFP16 =
10295     static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
10296 
10297   // Make v4f16 (only) fcmp operations utilise vector instructions
10298   // v8f16 support will be a little more complicated
10299   if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
10300     if (LHS.getValueType().getVectorNumElements() == 4) {
      // Extend both operands to v4f32 and compare in v4i32; the result is
      // brought back to the node's result type by getSExtOrTrunc below.
10301       LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
10302       RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
10303       SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
10304       DAG.ReplaceAllUsesWith(Op, NewSetcc);
10305       CmpVT = MVT::v4i32;
10306     } else
10307       return SDValue();
10308   }
10309 
  // NOTE(review): as written this assert can only fail for f128 elements when
  // FullFP16 is set; confirm that is the intended condition.
10310   assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
10311           LHS.getValueType().getVectorElementType() != MVT::f128);
10312 
10313   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10314   // clean.  Some of them require two branches to implement.
10315   AArch64CC::CondCode CC1, CC2;
10316   bool ShouldInvert;
10317   changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
10318 
10319   bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
10320   SDValue Cmp =
10321       EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
10322   if (!Cmp.getNode())
10323     return SDValue();
10324 
  // A second condition (anything other than AL) is OR'ed into the first.
10325   if (CC2 != AArch64CC::AL) {
10326     SDValue Cmp2 =
10327         EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
10328     if (!Cmp2.getNode())
10329       return SDValue();
10330 
10331     Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
10332   }
10333 
10334   Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
10335 
10336   if (ShouldInvert)
10337     Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
10338 
10339   return Cmp;
10340 }
10341 
10342 static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
10343                                   SelectionDAG &DAG) {
10344   SDValue VecOp = ScalarOp.getOperand(0);
10345   auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
10346   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
10347                      DAG.getConstant(0, DL, MVT::i64));
10348 }
10349 
/// Lower a VECREDUCE_* node.
///
/// Scalable sources, and fixed-length sources for which
/// useSVEForFixedLengthVectorVT returns true, are lowered to the matching
/// predicated SVE reduction (i1 element sources go to
/// LowerPredReductionToSVE). All other sources use the NEON across-lanes
/// nodes, or the fmaxnmv / fminnmv intrinsics for FMAX / FMIN.
10350 SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
10351                                               SelectionDAG &DAG) const {
10352   SDValue Src = Op.getOperand(0);
10353 
10354   // Try to lower fixed length reductions to SVE.
10355   EVT SrcVT = Src.getValueType();
  // OverrideNEON is set for AND/OR/XOR/FADD reductions, and for every
  // reduction other than ADD on i64 elements. None of these except FADD has a
  // case in the NEON switch below (and FADD has none either), so presumably
  // they must take the SVE path -- see useSVEForFixedLengthVectorVT to confirm.
10356   bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_AND ||
10357                       Op.getOpcode() == ISD::VECREDUCE_OR ||
10358                       Op.getOpcode() == ISD::VECREDUCE_XOR ||
10359                       Op.getOpcode() == ISD::VECREDUCE_FADD ||
10360                       (Op.getOpcode() != ISD::VECREDUCE_ADD &&
10361                        SrcVT.getVectorElementType() == MVT::i64);
10362   if (SrcVT.isScalableVector() ||
10363       useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
10364 
10365     if (SrcVT.getVectorElementType() == MVT::i1)
10366       return LowerPredReductionToSVE(Op, DAG);
10367 
10368     switch (Op.getOpcode()) {
10369     case ISD::VECREDUCE_ADD:
10370       return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
10371     case ISD::VECREDUCE_AND:
10372       return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
10373     case ISD::VECREDUCE_OR:
10374       return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
10375     case ISD::VECREDUCE_SMAX:
10376       return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
10377     case ISD::VECREDUCE_SMIN:
10378       return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
10379     case ISD::VECREDUCE_UMAX:
10380       return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
10381     case ISD::VECREDUCE_UMIN:
10382       return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
10383     case ISD::VECREDUCE_XOR:
10384       return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
10385     case ISD::VECREDUCE_FADD:
10386       return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
10387     case ISD::VECREDUCE_FMAX:
10388       return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
10389     case ISD::VECREDUCE_FMIN:
10390       return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
10391     default:
10392       llvm_unreachable("Unhandled fixed length reduction");
10393     }
10394   }
10395 
10396   // Lower NEON reductions.
10397   SDLoc dl(Op);
10398   switch (Op.getOpcode()) {
10399   case ISD::VECREDUCE_ADD:
10400     return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
10401   case ISD::VECREDUCE_SMAX:
10402     return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
10403   case ISD::VECREDUCE_SMIN:
10404     return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
10405   case ISD::VECREDUCE_UMAX:
10406     return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
10407   case ISD::VECREDUCE_UMIN:
10408     return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
  // The floating-point min/max reductions are emitted as intrinsic nodes that
  // produce the scalar result type directly.
10409   case ISD::VECREDUCE_FMAX: {
10410     return DAG.getNode(
10411         ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
10412         DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
10413         Src);
10414   }
10415   case ISD::VECREDUCE_FMIN: {
10416     return DAG.getNode(
10417         ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
10418         DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
10419         Src);
10420   }
10421   default:
10422     llvm_unreachable("Unhandled reduction");
10423   }
10424 }
10425 
10426 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
10427                                                     SelectionDAG &DAG) const {
10428   auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
10429   if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
10430     return SDValue();
10431 
10432   // LSE has an atomic load-add instruction, but not a load-sub.
10433   SDLoc dl(Op);
10434   MVT VT = Op.getSimpleValueType();
10435   SDValue RHS = Op.getOperand(2);
10436   AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
10437   RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
10438   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
10439                        Op.getOperand(0), Op.getOperand(1), RHS,
10440                        AN->getMemOperand());
10441 }
10442 
10443 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
10444                                                     SelectionDAG &DAG) const {
10445   auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
10446   if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
10447     return SDValue();
10448 
10449   // LSE has an atomic load-clear instruction, but not a load-and.
10450   SDLoc dl(Op);
10451   MVT VT = Op.getSimpleValueType();
10452   SDValue RHS = Op.getOperand(2);
10453   AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
10454   RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
10455   return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
10456                        Op.getOperand(0), Op.getOperand(1), RHS,
10457                        AN->getMemOperand());
10458 }
10459 
/// Emit the call to `__chkstk` for a dynamic stack allocation.
///
/// Size is shifted right by 4 and copied into X15 before the call. After the
/// call Size is rebuilt as that shifted value shifted left by 4 again, so the
/// caller sees the original size with its low four bits cleared.
///
/// \param Chain  incoming chain.
/// \param Size   in/out: requested allocation size, updated as described.
/// \returns the chain produced by the call node.
10460 SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
10461     SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
10462   SDLoc dl(Op);
10463   EVT PtrVT = getPointerTy(DAG.getDataLayout());
10464   SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);
10465 
  // Register mask for the probe call, adjusted when the subtarget reports a
  // custom calling convention.
10466   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10467   const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
10468   if (Subtarget->hasCustomCallingConv())
10469     TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10470 
  // X15 receives Size / 16.
10471   Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
10472                      DAG.getConstant(4, dl, MVT::i64));
10473   Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
10474   Chain =
10475       DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
10476                   Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
10477                   DAG.getRegisterMask(Mask), Chain.getValue(1));
10478   // To match the actual intent better, we should read the output from X15 here
10479   // again (instead of potentially spilling it to the stack), but rereading Size
10480   // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
10481   // here.
10482 
  // Scale the value back up to bytes.
10483   Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
10484                      DAG.getConstant(4, dl, MVT::i64));
10485   return Chain;
10486 }
10487 
10488 SDValue
10489 AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
10490                                                SelectionDAG &DAG) const {
10491   assert(Subtarget->isTargetWindows() &&
10492          "Only Windows alloca probing supported");
10493   SDLoc dl(Op);
10494   // Get the inputs.
10495   SDNode *Node = Op.getNode();
10496   SDValue Chain = Op.getOperand(0);
10497   SDValue Size = Op.getOperand(1);
10498   MaybeAlign Align =
10499       cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
10500   EVT VT = Node->getValueType(0);
10501 
10502   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
10503           "no-stack-arg-probe")) {
10504     SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
10505     Chain = SP.getValue(1);
10506     SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
10507     if (Align)
10508       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
10509                        DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
10510     Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
10511     SDValue Ops[2] = {SP, Chain};
10512     return DAG.getMergeValues(Ops, dl);
10513   }
10514 
10515   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
10516 
10517   Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
10518 
10519   SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
10520   Chain = SP.getValue(1);
10521   SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
10522   if (Align)
10523     SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
10524                      DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
10525   Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
10526 
10527   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
10528                              DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
10529 
10530   SDValue Ops[2] = {SP, Chain};
10531   return DAG.getMergeValues(Ops, dl);
10532 }
10533 
10534 SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
10535                                            SelectionDAG &DAG) const {
10536   EVT VT = Op.getValueType();
10537   assert(VT != MVT::i64 && "Expected illegal VSCALE node");
10538 
10539   SDLoc DL(Op);
10540   APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
10541   return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)),
10542                             DL, VT);
10543 }
10544 
10545 /// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
10546 template <unsigned NumVecs>
10547 static bool
10548 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
10549               AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
10550   Info.opc = ISD::INTRINSIC_VOID;
10551   // Retrieve EC from first vector argument.
10552   const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
10553   ElementCount EC = VT.getVectorElementCount();
10554 #ifndef NDEBUG
10555   // Check the assumption that all input vectors are the same type.
10556   for (unsigned I = 0; I < NumVecs; ++I)
10557     assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
10558            "Invalid type.");
10559 #endif
10560   // memVT is `NumVecs * VT`.
10561   Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
10562                                 EC * NumVecs);
10563   Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1);
10564   Info.offset = 0;
10565   Info.align.reset();
10566   Info.flags = MachineMemOperand::MOStore;
10567   return true;
10568 }
10569 
10570 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
10571 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
10572 /// specified in the intrinsic calls.
10573 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
10574                                                const CallInst &I,
10575                                                MachineFunction &MF,
10576                                                unsigned Intrinsic) const {
10577   auto &DL = I.getModule()->getDataLayout();
10578   switch (Intrinsic) {
10579   case Intrinsic::aarch64_sve_st2:
10580     return setInfoSVEStN<2>(*this, DL, Info, I);
10581   case Intrinsic::aarch64_sve_st3:
10582     return setInfoSVEStN<3>(*this, DL, Info, I);
10583   case Intrinsic::aarch64_sve_st4:
10584     return setInfoSVEStN<4>(*this, DL, Info, I);
10585   case Intrinsic::aarch64_neon_ld2:
10586   case Intrinsic::aarch64_neon_ld3:
10587   case Intrinsic::aarch64_neon_ld4:
10588   case Intrinsic::aarch64_neon_ld1x2:
10589   case Intrinsic::aarch64_neon_ld1x3:
10590   case Intrinsic::aarch64_neon_ld1x4:
10591   case Intrinsic::aarch64_neon_ld2lane:
10592   case Intrinsic::aarch64_neon_ld3lane:
10593   case Intrinsic::aarch64_neon_ld4lane:
10594   case Intrinsic::aarch64_neon_ld2r:
10595   case Intrinsic::aarch64_neon_ld3r:
10596   case Intrinsic::aarch64_neon_ld4r: {
10597     Info.opc = ISD::INTRINSIC_W_CHAIN;
10598     // Conservatively set memVT to the entire set of vectors loaded.
10599     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
10600     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
10601     Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
10602     Info.offset = 0;
10603     Info.align.reset();
10604     // volatile loads with NEON intrinsics not supported
10605     Info.flags = MachineMemOperand::MOLoad;
10606     return true;
10607   }
10608   case Intrinsic::aarch64_neon_st2:
10609   case Intrinsic::aarch64_neon_st3:
10610   case Intrinsic::aarch64_neon_st4:
10611   case Intrinsic::aarch64_neon_st1x2:
10612   case Intrinsic::aarch64_neon_st1x3:
10613   case Intrinsic::aarch64_neon_st1x4:
10614   case Intrinsic::aarch64_neon_st2lane:
10615   case Intrinsic::aarch64_neon_st3lane:
10616   case Intrinsic::aarch64_neon_st4lane: {
10617     Info.opc = ISD::INTRINSIC_VOID;
10618     // Conservatively set memVT to the entire set of vectors stored.
10619     unsigned NumElts = 0;
10620     for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
10621       Type *ArgTy = I.getArgOperand(ArgI)->getType();
10622       if (!ArgTy->isVectorTy())
10623         break;
10624       NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
10625     }
10626     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
10627     Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
10628     Info.offset = 0;
10629     Info.align.reset();
10630     // volatile stores with NEON intrinsics not supported
10631     Info.flags = MachineMemOperand::MOStore;
10632     return true;
10633   }
10634   case Intrinsic::aarch64_ldaxr:
10635   case Intrinsic::aarch64_ldxr: {
10636     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
10637     Info.opc = ISD::INTRINSIC_W_CHAIN;
10638     Info.memVT = MVT::getVT(PtrTy->getElementType());
10639     Info.ptrVal = I.getArgOperand(0);
10640     Info.offset = 0;
10641     Info.align = DL.getABITypeAlign(PtrTy->getElementType());
10642     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
10643     return true;
10644   }
10645   case Intrinsic::aarch64_stlxr:
10646   case Intrinsic::aarch64_stxr: {
10647     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
10648     Info.opc = ISD::INTRINSIC_W_CHAIN;
10649     Info.memVT = MVT::getVT(PtrTy->getElementType());
10650     Info.ptrVal = I.getArgOperand(1);
10651     Info.offset = 0;
10652     Info.align = DL.getABITypeAlign(PtrTy->getElementType());
10653     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
10654     return true;
10655   }
10656   case Intrinsic::aarch64_ldaxp:
10657   case Intrinsic::aarch64_ldxp:
10658     Info.opc = ISD::INTRINSIC_W_CHAIN;
10659     Info.memVT = MVT::i128;
10660     Info.ptrVal = I.getArgOperand(0);
10661     Info.offset = 0;
10662     Info.align = Align(16);
10663     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
10664     return true;
10665   case Intrinsic::aarch64_stlxp:
10666   case Intrinsic::aarch64_stxp:
10667     Info.opc = ISD::INTRINSIC_W_CHAIN;
10668     Info.memVT = MVT::i128;
10669     Info.ptrVal = I.getArgOperand(2);
10670     Info.offset = 0;
10671     Info.align = Align(16);
10672     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
10673     return true;
10674   case Intrinsic::aarch64_sve_ldnt1: {
10675     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
10676     Info.opc = ISD::INTRINSIC_W_CHAIN;
10677     Info.memVT = MVT::getVT(I.getType());
10678     Info.ptrVal = I.getArgOperand(1);
10679     Info.offset = 0;
10680     Info.align = DL.getABITypeAlign(PtrTy->getElementType());
10681     Info.flags = MachineMemOperand::MOLoad;
10682     if (Intrinsic == Intrinsic::aarch64_sve_ldnt1)
10683       Info.flags |= MachineMemOperand::MONonTemporal;
10684     return true;
10685   }
10686   case Intrinsic::aarch64_sve_stnt1: {
10687     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType());
10688     Info.opc = ISD::INTRINSIC_W_CHAIN;
10689     Info.memVT = MVT::getVT(I.getOperand(0)->getType());
10690     Info.ptrVal = I.getArgOperand(2);
10691     Info.offset = 0;
10692     Info.align = DL.getABITypeAlign(PtrTy->getElementType());
10693     Info.flags = MachineMemOperand::MOStore;
10694     if (Intrinsic == Intrinsic::aarch64_sve_stnt1)
10695       Info.flags |= MachineMemOperand::MONonTemporal;
10696     return true;
10697   }
10698   default:
10699     break;
10700   }
10701 
10702   return false;
10703 }
10704 
10705 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
10706                                                   ISD::LoadExtType ExtTy,
10707                                                   EVT NewVT) const {
10708   // TODO: This may be worth removing. Check regression tests for diffs.
10709   if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
10710     return false;
10711 
10712   // If we're reducing the load width in order to avoid having to use an extra
10713   // instruction to do extension then it's probably a good idea.
10714   if (ExtTy != ISD::NON_EXTLOAD)
10715     return true;
10716   // Don't reduce load width if it would prevent us from combining a shift into
10717   // the offset.
10718   MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
10719   assert(Mem);
10720   const SDValue &Base = Mem->getBasePtr();
10721   if (Base.getOpcode() == ISD::ADD &&
10722       Base.getOperand(1).getOpcode() == ISD::SHL &&
10723       Base.getOperand(1).hasOneUse() &&
10724       Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
10725     // The shift can be combined if it matches the size of the value being
10726     // loaded (and so reducing the width would make it not match).
10727     uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
10728     uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
10729     if (ShiftAmount == Log2_32(LoadBytes))
10730       return false;
10731   }
10732   // We have no reason to disallow reducing the load width, so allow it.
10733   return true;
10734 }
10735 
10736 // Truncations from 64-bit GPR to 32-bit GPR is free.
10737 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
10738   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
10739     return false;
10740   uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedSize();
10741   uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedSize();
10742   return NumBits1 > NumBits2;
10743 }
10744 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
10745   if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
10746     return false;
10747   uint64_t NumBits1 = VT1.getFixedSizeInBits();
10748   uint64_t NumBits2 = VT2.getFixedSizeInBits();
10749   return NumBits1 > NumBits2;
10750 }
10751 
10752 /// Check if it is profitable to hoist instruction in then/else to if.
10753 /// Not profitable if I and it's user can form a FMA instruction
10754 /// because we prefer FMSUB/FMADD.
10755 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
10756   if (I->getOpcode() != Instruction::FMul)
10757     return true;
10758 
10759   if (!I->hasOneUse())
10760     return true;
10761 
10762   Instruction *User = I->user_back();
10763 
10764   if (User &&
10765       !(User->getOpcode() == Instruction::FSub ||
10766         User->getOpcode() == Instruction::FAdd))
10767     return true;
10768 
10769   const TargetOptions &Options = getTargetMachine().Options;
10770   const Function *F = I->getFunction();
10771   const DataLayout &DL = F->getParent()->getDataLayout();
10772   Type *Ty = User->getOperand(0)->getType();
10773 
10774   return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
10775            isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
10776            (Options.AllowFPOpFusion == FPOpFusion::Fast ||
10777             Options.UnsafeFPMath));
10778 }
10779 
10780 // All 32-bit GPR operations implicitly zero the high-half of the corresponding
10781 // 64-bit GPR.
10782 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
10783   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
10784     return false;
10785   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
10786   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
10787   return NumBits1 == 32 && NumBits2 == 64;
10788 }
10789 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
10790   if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
10791     return false;
10792   unsigned NumBits1 = VT1.getSizeInBits();
10793   unsigned NumBits2 = VT2.getSizeInBits();
10794   return NumBits1 == 32 && NumBits2 == 64;
10795 }
10796 
10797 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
10798   EVT VT1 = Val.getValueType();
10799   if (isZExtFree(VT1, VT2)) {
10800     return true;
10801   }
10802 
10803   if (Val.getOpcode() != ISD::LOAD)
10804     return false;
10805 
10806   // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
10807   return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
10808           VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
10809           VT1.getSizeInBits() <= 32);
10810 }
10811 
10812 bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
10813   if (isa<FPExtInst>(Ext))
10814     return false;
10815 
10816   // Vector types are not free.
10817   if (Ext->getType()->isVectorTy())
10818     return false;
10819 
10820   for (const Use &U : Ext->uses()) {
10821     // The extension is free if we can fold it with a left shift in an
10822     // addressing mode or an arithmetic operation: add, sub, and cmp.
10823 
10824     // Is there a shift?
10825     const Instruction *Instr = cast<Instruction>(U.getUser());
10826 
10827     // Is this a constant shift?
10828     switch (Instr->getOpcode()) {
10829     case Instruction::Shl:
10830       if (!isa<ConstantInt>(Instr->getOperand(1)))
10831         return false;
10832       break;
10833     case Instruction::GetElementPtr: {
10834       gep_type_iterator GTI = gep_type_begin(Instr);
10835       auto &DL = Ext->getModule()->getDataLayout();
10836       std::advance(GTI, U.getOperandNo()-1);
10837       Type *IdxTy = GTI.getIndexedType();
10838       // This extension will end up with a shift because of the scaling factor.
10839       // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
10840       // Get the shift amount based on the scaling factor:
10841       // log2(sizeof(IdxTy)) - log2(8).
10842       uint64_t ShiftAmt =
10843         countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3;
10844       // Is the constant foldable in the shift of the addressing mode?
10845       // I.e., shift amount is between 1 and 4 inclusive.
10846       if (ShiftAmt == 0 || ShiftAmt > 4)
10847         return false;
10848       break;
10849     }
10850     case Instruction::Trunc:
10851       // Check if this is a noop.
10852       // trunc(sext ty1 to ty2) to ty1.
10853       if (Instr->getType() == Ext->getOperand(0)->getType())
10854         continue;
10855       LLVM_FALLTHROUGH;
10856     default:
10857       return false;
10858     }
10859 
10860     // At this point we can use the bfm family, so this extension is free
10861     // for that use.
10862   }
10863   return true;
10864 }
10865 
10866 /// Check if both Op1 and Op2 are shufflevector extracts of either the lower
10867 /// or upper half of the vector elements.
10868 static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
10869   auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
10870     auto *FullTy = FullV->getType();
10871     auto *HalfTy = HalfV->getType();
10872     return FullTy->getPrimitiveSizeInBits().getFixedSize() ==
10873            2 * HalfTy->getPrimitiveSizeInBits().getFixedSize();
10874   };
10875 
10876   auto extractHalf = [](Value *FullV, Value *HalfV) {
10877     auto *FullVT = cast<FixedVectorType>(FullV->getType());
10878     auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
10879     return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
10880   };
10881 
10882   ArrayRef<int> M1, M2;
10883   Value *S1Op1, *S2Op1;
10884   if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
10885       !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
10886     return false;
10887 
10888   // Check that the operands are half as wide as the result and we extract
10889   // half of the elements of the input vectors.
10890   if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) ||
10891       !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2))
10892     return false;
10893 
10894   // Check the mask extracts either the lower or upper half of vector
10895   // elements.
10896   int M1Start = -1;
10897   int M2Start = -1;
10898   int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
10899   if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
10900       !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
10901       M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
10902     return false;
10903 
10904   return true;
10905 }
10906 
10907 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
10908 /// of the vector elements.
10909 static bool areExtractExts(Value *Ext1, Value *Ext2) {
10910   auto areExtDoubled = [](Instruction *Ext) {
10911     return Ext->getType()->getScalarSizeInBits() ==
10912            2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
10913   };
10914 
10915   if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
10916       !match(Ext2, m_ZExtOrSExt(m_Value())) ||
10917       !areExtDoubled(cast<Instruction>(Ext1)) ||
10918       !areExtDoubled(cast<Instruction>(Ext2)))
10919     return false;
10920 
10921   return true;
10922 }
10923 
10924 /// Check if Op could be used with vmull_high_p64 intrinsic.
10925 static bool isOperandOfVmullHighP64(Value *Op) {
10926   Value *VectorOperand = nullptr;
10927   ConstantInt *ElementIndex = nullptr;
10928   return match(Op, m_ExtractElt(m_Value(VectorOperand),
10929                                 m_ConstantInt(ElementIndex))) &&
10930          ElementIndex->getValue() == 1 &&
10931          isa<FixedVectorType>(VectorOperand->getType()) &&
10932          cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
10933 }
10934 
10935 /// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
10936 static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
10937   return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
10938 }
10939 
10940 /// Check if sinking \p I's operands to I's basic block is profitable, because
10941 /// the operands can be folded into a target instruction, e.g.
10942 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
10943 bool AArch64TargetLowering::shouldSinkOperands(
10944     Instruction *I, SmallVectorImpl<Use *> &Ops) const {
10945   if (!I->getType()->isVectorTy())
10946     return false;
10947 
10948   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
10949     switch (II->getIntrinsicID()) {
10950     case Intrinsic::aarch64_neon_umull:
10951       if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
10952         return false;
10953       Ops.push_back(&II->getOperandUse(0));
10954       Ops.push_back(&II->getOperandUse(1));
10955       return true;
10956 
10957     case Intrinsic::aarch64_neon_pmull64:
10958       if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
10959                                      II->getArgOperand(1)))
10960         return false;
10961       Ops.push_back(&II->getArgOperandUse(0));
10962       Ops.push_back(&II->getArgOperandUse(1));
10963       return true;
10964 
10965     default:
10966       return false;
10967     }
10968   }
10969 
10970   switch (I->getOpcode()) {
10971   case Instruction::Sub:
10972   case Instruction::Add: {
10973     if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
10974       return false;
10975 
10976     // If the exts' operands extract either the lower or upper elements, we
10977     // can sink them too.
10978     auto Ext1 = cast<Instruction>(I->getOperand(0));
10979     auto Ext2 = cast<Instruction>(I->getOperand(1));
10980     if (areExtractShuffleVectors(Ext1, Ext2)) {
10981       Ops.push_back(&Ext1->getOperandUse(0));
10982       Ops.push_back(&Ext2->getOperandUse(0));
10983     }
10984 
10985     Ops.push_back(&I->getOperandUse(0));
10986     Ops.push_back(&I->getOperandUse(1));
10987 
10988     return true;
10989   }
10990   case Instruction::Mul: {
10991     bool IsProfitable = false;
10992     for (auto &Op : I->operands()) {
10993       // Make sure we are not already sinking this operand
10994       if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
10995         continue;
10996 
10997       ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
10998       if (!Shuffle || !Shuffle->isZeroEltSplat())
10999         continue;
11000 
11001       Value *ShuffleOperand = Shuffle->getOperand(0);
11002       InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
11003       if (!Insert)
11004         continue;
11005 
11006       Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
11007       if (!OperandInstr)
11008         continue;
11009 
11010       ConstantInt *ElementConstant =
11011           dyn_cast<ConstantInt>(Insert->getOperand(2));
11012       // Check that the insertelement is inserting into element 0
11013       if (!ElementConstant || ElementConstant->getZExtValue() != 0)
11014         continue;
11015 
11016       unsigned Opcode = OperandInstr->getOpcode();
11017       if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt)
11018         continue;
11019 
11020       Ops.push_back(&Shuffle->getOperandUse(0));
11021       Ops.push_back(&Op);
11022       IsProfitable = true;
11023     }
11024 
11025     return IsProfitable;
11026   }
11027   default:
11028     return false;
11029   }
11030   return false;
11031 }
11032 
11033 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
11034                                           Align &RequiredAligment) const {
11035   if (!LoadedType.isSimple() ||
11036       (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
11037     return false;
11038   // Cyclone supports unaligned accesses.
11039   RequiredAligment = Align(1);
11040   unsigned NumBits = LoadedType.getSizeInBits();
11041   return NumBits == 32 || NumBits == 64;
11042 }
11043 
11044 /// A helper function for determining the number of interleaved accesses we
11045 /// will generate when lowering accesses of the given type.
11046 unsigned
11047 AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
11048                                                  const DataLayout &DL) const {
11049   return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
11050 }
11051 
11052 MachineMemOperand::Flags
11053 AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
11054   if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
11055       I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
11056     return MOStridedAccess;
11057   return MachineMemOperand::MONone;
11058 }
11059 
11060 bool AArch64TargetLowering::isLegalInterleavedAccessType(
11061     VectorType *VecTy, const DataLayout &DL) const {
11062 
11063   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
11064   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
11065 
11066   // Ensure the number of vector elements is greater than 1.
11067   if (cast<FixedVectorType>(VecTy)->getNumElements() < 2)
11068     return false;
11069 
11070   // Ensure the element type is legal.
11071   if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
11072     return false;
11073 
11074   // Ensure the total vector size is 64 or a multiple of 128. Types larger than
11075   // 128 will be split into multiple interleaved accesses.
11076   return VecSize == 64 || VecSize % 128 == 0;
11077 }
11078 
11079 /// Lower an interleaved load into a ldN intrinsic.
11080 ///
11081 /// E.g. Lower an interleaved load (Factor = 2):
11082 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
11083 ///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
11084 ///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
11085 ///
11086 ///      Into:
11087 ///        %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
11088 ///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
11089 ///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
11090 bool AArch64TargetLowering::lowerInterleavedLoad(
11091     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
11092     ArrayRef<unsigned> Indices, unsigned Factor) const {
11093   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
11094          "Invalid interleave factor");
11095   assert(!Shuffles.empty() && "Empty shufflevector input");
11096   assert(Shuffles.size() == Indices.size() &&
11097          "Unmatched number of shufflevectors and indices");
11098 
11099   const DataLayout &DL = LI->getModule()->getDataLayout();
11100 
11101   VectorType *VTy = Shuffles[0]->getType();
11102 
11103   // Skip if we do not have NEON and skip illegal vector types. We can
11104   // "legalize" wide vector types into multiple interleaved accesses as long as
11105   // the vector types are divisible by 128.
11106   if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL))
11107     return false;
11108 
11109   unsigned NumLoads = getNumInterleavedAccesses(VTy, DL);
11110 
11111   auto *FVTy = cast<FixedVectorType>(VTy);
11112 
11113   // A pointer vector can not be the return type of the ldN intrinsics. Need to
11114   // load integer vectors first and then convert to pointer vectors.
11115   Type *EltTy = FVTy->getElementType();
11116   if (EltTy->isPointerTy())
11117     FVTy =
11118         FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
11119 
11120   IRBuilder<> Builder(LI);
11121 
11122   // The base address of the load.
11123   Value *BaseAddr = LI->getPointerOperand();
11124 
11125   if (NumLoads > 1) {
11126     // If we're going to generate more than one load, reset the sub-vector type
11127     // to something legal.
11128     FVTy = FixedVectorType::get(FVTy->getElementType(),
11129                                 FVTy->getNumElements() / NumLoads);
11130 
11131     // We will compute the pointer operand of each load from the original base
11132     // address using GEPs. Cast the base address to a pointer to the scalar
11133     // element type.
11134     BaseAddr = Builder.CreateBitCast(
11135         BaseAddr,
11136         FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
11137   }
11138 
11139   Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace());
11140   Type *Tys[2] = {FVTy, PtrTy};
11141   static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
11142                                             Intrinsic::aarch64_neon_ld3,
11143                                             Intrinsic::aarch64_neon_ld4};
11144   Function *LdNFunc =
11145       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
11146 
11147   // Holds sub-vectors extracted from the load intrinsic return values. The
11148   // sub-vectors are associated with the shufflevector instructions they will
11149   // replace.
11150   DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
11151 
11152   for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
11153 
11154     // If we're generating more than one load, compute the base address of
11155     // subsequent loads as an offset from the previous.
11156     if (LoadCount > 0)
11157       BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr,
11158                                             FVTy->getNumElements() * Factor);
11159 
11160     CallInst *LdN = Builder.CreateCall(
11161         LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
11162 
11163     // Extract and store the sub-vectors returned by the load intrinsic.
11164     for (unsigned i = 0; i < Shuffles.size(); i++) {
11165       ShuffleVectorInst *SVI = Shuffles[i];
11166       unsigned Index = Indices[i];
11167 
11168       Value *SubVec = Builder.CreateExtractValue(LdN, Index);
11169 
11170       // Convert the integer vector to pointer vector if the element is pointer.
11171       if (EltTy->isPointerTy())
11172         SubVec = Builder.CreateIntToPtr(
11173             SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
11174                                          FVTy->getNumElements()));
11175       SubVecs[SVI].push_back(SubVec);
11176     }
11177   }
11178 
11179   // Replace uses of the shufflevector instructions with the sub-vectors
11180   // returned by the load intrinsic. If a shufflevector instruction is
11181   // associated with more than one sub-vector, those sub-vectors will be
11182   // concatenated into a single wide vector.
11183   for (ShuffleVectorInst *SVI : Shuffles) {
11184     auto &SubVec = SubVecs[SVI];
11185     auto *WideVec =
11186         SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
11187     SVI->replaceAllUsesWith(WideVec);
11188   }
11189 
11190   return true;
11191 }
11192 
11193 /// Lower an interleaved store into a stN intrinsic.
11194 ///
11195 /// E.g. Lower an interleaved store (Factor = 3):
11196 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
11197 ///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
11198 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
11199 ///
11200 ///      Into:
11201 ///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
11202 ///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
11203 ///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
11204 ///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
11205 ///
11206 /// Note that the new shufflevectors will be removed and we'll only generate one
11207 /// st3 instruction in CodeGen.
11208 ///
11209 /// Example for a more general valid mask (Factor 3). Lower:
11210 ///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
11211 ///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
11212 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
11213 ///
11214 ///      Into:
11215 ///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
11216 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
11217 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
11218 ///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
11219 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
11220                                                   ShuffleVectorInst *SVI,
11221                                                   unsigned Factor) const {
11222   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
11223          "Invalid interleave factor");
11224 
11225   auto *VecTy = cast<FixedVectorType>(SVI->getType());
11226   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
11227 
11228   unsigned LaneLen = VecTy->getNumElements() / Factor;
11229   Type *EltTy = VecTy->getElementType();
11230   auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
11231 
11232   const DataLayout &DL = SI->getModule()->getDataLayout();
11233 
11234   // Skip if we do not have NEON and skip illegal vector types. We can
11235   // "legalize" wide vector types into multiple interleaved accesses as long as
11236   // the vector types are divisible by 128.
11237   if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
11238     return false;
11239 
11240   unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
11241 
11242   Value *Op0 = SVI->getOperand(0);
11243   Value *Op1 = SVI->getOperand(1);
11244   IRBuilder<> Builder(SI);
11245 
11246   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
11247   // vectors to integer vectors.
11248   if (EltTy->isPointerTy()) {
11249     Type *IntTy = DL.getIntPtrType(EltTy);
11250     unsigned NumOpElts =
11251         cast<FixedVectorType>(Op0->getType())->getNumElements();
11252 
11253     // Convert to the corresponding integer vector.
11254     auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
11255     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
11256     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
11257 
11258     SubVecTy = FixedVectorType::get(IntTy, LaneLen);
11259   }
11260 
11261   // The base address of the store.
11262   Value *BaseAddr = SI->getPointerOperand();
11263 
11264   if (NumStores > 1) {
11265     // If we're going to generate more than one store, reset the lane length
11266     // and sub-vector type to something legal.
11267     LaneLen /= NumStores;
11268     SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
11269 
11270     // We will compute the pointer operand of each store from the original base
11271     // address using GEPs. Cast the base address to a pointer to the scalar
11272     // element type.
11273     BaseAddr = Builder.CreateBitCast(
11274         BaseAddr,
11275         SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
11276   }
11277 
11278   auto Mask = SVI->getShuffleMask();
11279 
11280   Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
11281   Type *Tys[2] = {SubVecTy, PtrTy};
11282   static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
11283                                              Intrinsic::aarch64_neon_st3,
11284                                              Intrinsic::aarch64_neon_st4};
11285   Function *StNFunc =
11286       Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
11287 
11288   for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
11289 
11290     SmallVector<Value *, 5> Ops;
11291 
11292     // Split the shufflevector operands into sub vectors for the new stN call.
11293     for (unsigned i = 0; i < Factor; i++) {
11294       unsigned IdxI = StoreCount * LaneLen * Factor + i;
11295       if (Mask[IdxI] >= 0) {
11296         Ops.push_back(Builder.CreateShuffleVector(
11297             Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
11298       } else {
11299         unsigned StartMask = 0;
11300         for (unsigned j = 1; j < LaneLen; j++) {
11301           unsigned IdxJ = StoreCount * LaneLen * Factor + j;
11302           if (Mask[IdxJ * Factor + IdxI] >= 0) {
11303             StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
11304             break;
11305           }
11306         }
11307         // Note: Filling undef gaps with random elements is ok, since
11308         // those elements were being written anyway (with undefs).
11309         // In the case of all undefs we're defaulting to using elems from 0
11310         // Note: StartMask cannot be negative, it's checked in
11311         // isReInterleaveMask
11312         Ops.push_back(Builder.CreateShuffleVector(
11313             Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
11314       }
11315     }
11316 
11317     // If we generating more than one store, we compute the base address of
11318     // subsequent stores as an offset from the previous.
11319     if (StoreCount > 0)
11320       BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
11321                                             BaseAddr, LaneLen * Factor);
11322 
11323     Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
11324     Builder.CreateCall(StNFunc, Ops);
11325   }
11326   return true;
11327 }
11328 
11329 // Lower an SVE structured load intrinsic returning a tuple type to target
11330 // specific intrinsic taking the same input but returning a multi-result value
11331 // of the split tuple type.
11332 //
11333 // E.g. Lowering an LD3:
11334 //
11335 //  call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
11336 //                                                    <vscale x 4 x i1> %pred,
11337 //                                                    <vscale x 4 x i32>* %addr)
11338 //
11339 //  Output DAG:
11340 //
11341 //    t0: ch = EntryToken
11342 //        t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
11343 //        t4: i64,ch = CopyFromReg t0, Register:i64 %1
11344 //    t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
11345 //    t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
11346 //
11347 // This is called pre-legalization to avoid widening/splitting issues with
11348 // non-power-of-2 tuple types used for LD3, such as nxv12i32.
11349 SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
11350                                                   ArrayRef<SDValue> LoadOps,
11351                                                   EVT VT, SelectionDAG &DAG,
11352                                                   const SDLoc &DL) const {
11353   assert(VT.isScalableVector() && "Can only lower scalable vectors");
11354 
11355   unsigned N, Opcode;
11356   static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
11357       {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
11358       {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
11359       {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
11360 
11361   std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
11362   assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 &&
11363          "invalid tuple vector type!");
11364 
11365   EVT SplitVT =
11366       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
11367                        VT.getVectorElementCount().divideCoefficientBy(N));
11368   assert(isTypeLegal(SplitVT));
11369 
11370   SmallVector<EVT, 5> VTs(N, SplitVT);
11371   VTs.push_back(MVT::Other); // Chain
11372   SDVTList NodeTys = DAG.getVTList(VTs);
11373 
11374   SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
11375   SmallVector<SDValue, 4> PseudoLoadOps;
11376   for (unsigned I = 0; I < N; ++I)
11377     PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
11378   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
11379 }
11380 
11381 EVT AArch64TargetLowering::getOptimalMemOpType(
11382     const MemOp &Op, const AttributeList &FuncAttributes) const {
11383   bool CanImplicitFloat =
11384       !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
11385   bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
11386   bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
11387   // Only use AdvSIMD to implement memset of 32-byte and above. It would have
11388   // taken one instruction to materialize the v2i64 zero and one store (with
11389   // restrictive addressing mode). Just do i64 stores.
11390   bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
11391   auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
11392     if (Op.isAligned(AlignCheck))
11393       return true;
11394     bool Fast;
11395     return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
11396                                           &Fast) &&
11397            Fast;
11398   };
11399 
11400   if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
11401       AlignmentIsAcceptable(MVT::v2i64, Align(16)))
11402     return MVT::v2i64;
11403   if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
11404     return MVT::f128;
11405   if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
11406     return MVT::i64;
11407   if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
11408     return MVT::i32;
11409   return MVT::Other;
11410 }
11411 
11412 LLT AArch64TargetLowering::getOptimalMemOpLLT(
11413     const MemOp &Op, const AttributeList &FuncAttributes) const {
11414   bool CanImplicitFloat =
11415       !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
11416   bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
11417   bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
11418   // Only use AdvSIMD to implement memset of 32-byte and above. It would have
11419   // taken one instruction to materialize the v2i64 zero and one store (with
11420   // restrictive addressing mode). Just do i64 stores.
11421   bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
11422   auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
11423     if (Op.isAligned(AlignCheck))
11424       return true;
11425     bool Fast;
11426     return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
11427                                           &Fast) &&
11428            Fast;
11429   };
11430 
11431   if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
11432       AlignmentIsAcceptable(MVT::v2i64, Align(16)))
11433     return LLT::vector(2, 64);
11434   if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
11435     return LLT::scalar(128);
11436   if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
11437     return LLT::scalar(64);
11438   if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
11439     return LLT::scalar(32);
11440   return LLT();
11441 }
11442 
11443 // 12-bit optionally shifted immediates are legal for adds.
11444 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
11445   if (Immed == std::numeric_limits<int64_t>::min()) {
11446     LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
11447                       << ": avoid UB for INT64_MIN\n");
11448     return false;
11449   }
11450   // Same encoding for add/sub, just flip the sign.
11451   Immed = std::abs(Immed);
11452   bool IsLegal = ((Immed >> 12) == 0 ||
11453                   ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
11454   LLVM_DEBUG(dbgs() << "Is " << Immed
11455                     << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
11456   return IsLegal;
11457 }
11458 
11459 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid
11460 // immediates is the same as for an add or a sub.
11461 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
11462   return isLegalAddImmediate(Immed);
11463 }
11464 
11465 /// isLegalAddressingMode - Return true if the addressing mode represented
11466 /// by AM is legal for this target, for a load/store of the specified type.
11467 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
11468                                                   const AddrMode &AM, Type *Ty,
11469                                                   unsigned AS, Instruction *I) const {
11470   // AArch64 has five basic addressing modes:
11471   //  reg
11472   //  reg + 9-bit signed offset
11473   //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
11474   //  reg1 + reg2
11475   //  reg + SIZE_IN_BYTES * reg
11476 
11477   // No global is ever allowed as a base.
11478   if (AM.BaseGV)
11479     return false;
11480 
11481   // No reg+reg+imm addressing.
11482   if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
11483     return false;
11484 
11485   // FIXME: Update this method to support scalable addressing modes.
11486   if (isa<ScalableVectorType>(Ty))
11487     return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;
11488 
11489   // check reg + imm case:
11490   // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
11491   uint64_t NumBytes = 0;
11492   if (Ty->isSized()) {
11493     uint64_t NumBits = DL.getTypeSizeInBits(Ty);
11494     NumBytes = NumBits / 8;
11495     if (!isPowerOf2_64(NumBits))
11496       NumBytes = 0;
11497   }
11498 
11499   if (!AM.Scale) {
11500     int64_t Offset = AM.BaseOffs;
11501 
11502     // 9-bit signed offset
11503     if (isInt<9>(Offset))
11504       return true;
11505 
11506     // 12-bit unsigned offset
11507     unsigned shift = Log2_64(NumBytes);
11508     if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
11509         // Must be a multiple of NumBytes (NumBytes is a power of 2)
11510         (Offset >> shift) << shift == Offset)
11511       return true;
11512     return false;
11513   }
11514 
11515   // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
11516 
11517   return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
11518 }
11519 
11520 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
11521   // Consider splitting large offset of struct or array.
11522   return true;
11523 }
11524 
11525 int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
11526                                                 const AddrMode &AM, Type *Ty,
11527                                                 unsigned AS) const {
11528   // Scaling factors are not free at all.
11529   // Operands                     | Rt Latency
11530   // -------------------------------------------
11531   // Rt, [Xn, Xm]                 | 4
11532   // -------------------------------------------
11533   // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
11534   // Rt, [Xn, Wm, <extend> #imm]  |
11535   if (isLegalAddressingMode(DL, AM, Ty, AS))
11536     // Scale represents reg2 * scale, thus account for 1 if
11537     // it is not equal to 0 or 1.
11538     return AM.Scale != 0 && AM.Scale != 1;
11539   return -1;
11540 }
11541 
11542 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
11543     const MachineFunction &MF, EVT VT) const {
11544   VT = VT.getScalarType();
11545 
11546   if (!VT.isSimple())
11547     return false;
11548 
11549   switch (VT.getSimpleVT().SimpleTy) {
11550   case MVT::f32:
11551   case MVT::f64:
11552     return true;
11553   default:
11554     break;
11555   }
11556 
11557   return false;
11558 }
11559 
11560 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
11561                                                        Type *Ty) const {
11562   switch (Ty->getScalarType()->getTypeID()) {
11563   case Type::FloatTyID:
11564   case Type::DoubleTyID:
11565     return true;
11566   default:
11567     return false;
11568   }
11569 }
11570 
11571 const MCPhysReg *
11572 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
11573   // LR is a callee-save register, but we must treat it as clobbered by any call
11574   // site. Hence we include LR in the scratch registers, which are in turn added
11575   // as implicit-defs for stackmaps and patchpoints.
11576   static const MCPhysReg ScratchRegs[] = {
11577     AArch64::X16, AArch64::X17, AArch64::LR, 0
11578   };
11579   return ScratchRegs;
11580 }
11581 
11582 bool
11583 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
11584                                                      CombineLevel Level) const {
11585   N = N->getOperand(0).getNode();
11586   EVT VT = N->getValueType(0);
11587     // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
11588     // it with shift to let it be lowered to UBFX.
11589   if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
11590       isa<ConstantSDNode>(N->getOperand(1))) {
11591     uint64_t TruncMask = N->getConstantOperandVal(1);
11592     if (isMask_64(TruncMask) &&
11593       N->getOperand(0).getOpcode() == ISD::SRL &&
11594       isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
11595       return false;
11596   }
11597   return true;
11598 }
11599 
11600 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
11601                                                               Type *Ty) const {
11602   assert(Ty->isIntegerTy());
11603 
11604   unsigned BitSize = Ty->getPrimitiveSizeInBits();
11605   if (BitSize == 0)
11606     return false;
11607 
11608   int64_t Val = Imm.getSExtValue();
11609   if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
11610     return true;
11611 
11612   if ((int64_t)Val < 0)
11613     Val = ~Val;
11614   if (BitSize == 32)
11615     Val &= (1LL << 32) - 1;
11616 
11617   unsigned LZ = countLeadingZeros((uint64_t)Val);
11618   unsigned Shift = (63 - LZ) / 16;
11619   // MOVZ is free so return true for one or fewer MOVK.
11620   return Shift < 3;
11621 }
11622 
11623 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
11624                                                     unsigned Index) const {
11625   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
11626     return false;
11627 
11628   return (Index == 0 || Index == ResVT.getVectorNumElements());
11629 }
11630 
11631 /// Turn vector tests of the signbit in the form of:
11632 ///   xor (sra X, elt_size(X)-1), -1
11633 /// into:
11634 ///   cmge X, X, #0
11635 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
11636                                          const AArch64Subtarget *Subtarget) {
11637   EVT VT = N->getValueType(0);
11638   if (!Subtarget->hasNEON() || !VT.isVector())
11639     return SDValue();
11640 
11641   // There must be a shift right algebraic before the xor, and the xor must be a
11642   // 'not' operation.
11643   SDValue Shift = N->getOperand(0);
11644   SDValue Ones = N->getOperand(1);
11645   if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
11646       !ISD::isBuildVectorAllOnes(Ones.getNode()))
11647     return SDValue();
11648 
11649   // The shift should be smearing the sign bit across each vector element.
11650   auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
11651   EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
11652   if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
11653     return SDValue();
11654 
11655   return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
11656 }
11657 
11658 // VECREDUCE_ADD( EXTEND(v16i8_type) ) to
11659 // VECREDUCE_ADD( DOTv16i8(v16i8_type) )
11660 static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
11661                                           const AArch64Subtarget *ST) {
11662   SDValue Op0 = N->getOperand(0);
11663   if (!ST->hasDotProd() || N->getValueType(0) != MVT::i32)
11664     return SDValue();
11665 
11666   if (Op0.getValueType().getVectorElementType() != MVT::i32)
11667     return SDValue();
11668 
11669   unsigned ExtOpcode = Op0.getOpcode();
11670   if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
11671     return SDValue();
11672 
11673   EVT Op0VT = Op0.getOperand(0).getValueType();
11674   if (Op0VT != MVT::v16i8)
11675     return SDValue();
11676 
11677   SDLoc DL(Op0);
11678   SDValue Ones = DAG.getConstant(1, DL, Op0VT);
11679   SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
11680   auto DotIntrisic = (ExtOpcode == ISD::ZERO_EXTEND)
11681                          ? Intrinsic::aarch64_neon_udot
11682                          : Intrinsic::aarch64_neon_sdot;
11683   SDValue Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Zeros.getValueType(),
11684                             DAG.getConstant(DotIntrisic, DL, MVT::i32), Zeros,
11685                             Ones, Op0.getOperand(0));
11686   return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
11687 }
11688 
11689 // Given a ABS node, detect the following pattern:
11690 // (ABS (SUB (EXTEND a), (EXTEND b))).
11691 // Generates UABD/SABD instruction.
11692 static SDValue performABSCombine(SDNode *N, SelectionDAG &DAG,
11693                                  TargetLowering::DAGCombinerInfo &DCI,
11694                                  const AArch64Subtarget *Subtarget) {
11695   SDValue AbsOp1 = N->getOperand(0);
11696   SDValue Op0, Op1;
11697 
11698   if (AbsOp1.getOpcode() != ISD::SUB)
11699     return SDValue();
11700 
11701   Op0 = AbsOp1.getOperand(0);
11702   Op1 = AbsOp1.getOperand(1);
11703 
11704   unsigned Opc0 = Op0.getOpcode();
11705   // Check if the operands of the sub are (zero|sign)-extended.
11706   if (Opc0 != Op1.getOpcode() ||
11707       (Opc0 != ISD::ZERO_EXTEND && Opc0 != ISD::SIGN_EXTEND))
11708     return SDValue();
11709 
11710   EVT VectorT1 = Op0.getOperand(0).getValueType();
11711   EVT VectorT2 = Op1.getOperand(0).getValueType();
11712   // Check if vectors are of same type and valid size.
11713   uint64_t Size = VectorT1.getFixedSizeInBits();
11714   if (VectorT1 != VectorT2 || (Size != 64 && Size != 128))
11715     return SDValue();
11716 
11717   // Check if vector element types are valid.
11718   EVT VT1 = VectorT1.getVectorElementType();
11719   if (VT1 != MVT::i8 && VT1 != MVT::i16 && VT1 != MVT::i32)
11720     return SDValue();
11721 
11722   Op0 = Op0.getOperand(0);
11723   Op1 = Op1.getOperand(0);
11724   unsigned ABDOpcode =
11725       (Opc0 == ISD::SIGN_EXTEND) ? AArch64ISD::SABD : AArch64ISD::UABD;
11726   SDValue ABD =
11727       DAG.getNode(ABDOpcode, SDLoc(N), Op0->getValueType(0), Op0, Op1);
11728   return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), ABD);
11729 }
11730 
11731 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
11732                                  TargetLowering::DAGCombinerInfo &DCI,
11733                                  const AArch64Subtarget *Subtarget) {
11734   if (DCI.isBeforeLegalizeOps())
11735     return SDValue();
11736 
11737   return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
11738 }
11739 
11740 SDValue
11741 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
11742                                      SelectionDAG &DAG,
11743                                      SmallVectorImpl<SDNode *> &Created) const {
11744   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
11745   if (isIntDivCheap(N->getValueType(0), Attr))
11746     return SDValue(N,0); // Lower SDIV as SDIV
11747 
11748   // fold (sdiv X, pow2)
11749   EVT VT = N->getValueType(0);
11750   if ((VT != MVT::i32 && VT != MVT::i64) ||
11751       !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
11752     return SDValue();
11753 
11754   SDLoc DL(N);
11755   SDValue N0 = N->getOperand(0);
11756   unsigned Lg2 = Divisor.countTrailingZeros();
11757   SDValue Zero = DAG.getConstant(0, DL, VT);
11758   SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
11759 
11760   // Add (N0 < 0) ? Pow2 - 1 : 0;
11761   SDValue CCVal;
11762   SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
11763   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
11764   SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
11765 
11766   Created.push_back(Cmp.getNode());
11767   Created.push_back(Add.getNode());
11768   Created.push_back(CSel.getNode());
11769 
11770   // Divide by pow2.
11771   SDValue SRA =
11772       DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
11773 
11774   // If we're dividing by a positive value, we're done.  Otherwise, we must
11775   // negate the result.
11776   if (Divisor.isNonNegative())
11777     return SRA;
11778 
11779   Created.push_back(SRA.getNode());
11780   return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
11781 }
11782 
11783 static bool IsSVECntIntrinsic(SDValue S) {
11784   switch(getIntrinsicID(S.getNode())) {
11785   default:
11786     break;
11787   case Intrinsic::aarch64_sve_cntb:
11788   case Intrinsic::aarch64_sve_cnth:
11789   case Intrinsic::aarch64_sve_cntw:
11790   case Intrinsic::aarch64_sve_cntd:
11791     return true;
11792   }
11793   return false;
11794 }
11795 
11796 /// Calculates what the pre-extend type is, based on the extension
11797 /// operation node provided by \p Extend.
11798 ///
11799 /// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
11800 /// pre-extend type is pulled directly from the operand, while other extend
11801 /// operations need a bit more inspection to get this information.
11802 ///
11803 /// \param Extend The SDNode from the DAG that represents the extend operation
11804 /// \param DAG The SelectionDAG hosting the \p Extend node
11805 ///
11806 /// \returns The type representing the \p Extend source type, or \p MVT::Other
11807 /// if no valid type can be determined
11808 static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) {
11809   switch (Extend.getOpcode()) {
11810   case ISD::SIGN_EXTEND:
11811   case ISD::ZERO_EXTEND:
11812     return Extend.getOperand(0).getValueType();
11813   case ISD::AssertSext:
11814   case ISD::AssertZext:
11815   case ISD::SIGN_EXTEND_INREG: {
11816     VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
11817     if (!TypeNode)
11818       return MVT::Other;
11819     return TypeNode->getVT();
11820   }
11821   case ISD::AND: {
11822     ConstantSDNode *Constant =
11823         dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
11824     if (!Constant)
11825       return MVT::Other;
11826 
11827     uint32_t Mask = Constant->getZExtValue();
11828 
11829     if (Mask == UCHAR_MAX)
11830       return MVT::i8;
11831     else if (Mask == USHRT_MAX)
11832       return MVT::i16;
11833     else if (Mask == UINT_MAX)
11834       return MVT::i32;
11835 
11836     return MVT::Other;
11837   }
11838   default:
11839     return MVT::Other;
11840   }
11841 
11842   llvm_unreachable("Code path unhandled in calculatePreExtendType!");
11843 }
11844 
11845 /// Combines a dup(sext/zext) node pattern into sext/zext(dup)
11846 /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
11847 static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle,
11848                                                 SelectionDAG &DAG) {
11849 
11850   ShuffleVectorSDNode *ShuffleNode =
11851       dyn_cast<ShuffleVectorSDNode>(VectorShuffle.getNode());
11852   if (!ShuffleNode)
11853     return SDValue();
11854 
11855   // Ensuring the mask is zero before continuing
11856   if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0)
11857     return SDValue();
11858 
11859   SDValue InsertVectorElt = VectorShuffle.getOperand(0);
11860 
11861   if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
11862     return SDValue();
11863 
11864   SDValue InsertLane = InsertVectorElt.getOperand(2);
11865   ConstantSDNode *Constant = dyn_cast<ConstantSDNode>(InsertLane.getNode());
11866   // Ensures the insert is inserting into lane 0
11867   if (!Constant || Constant->getZExtValue() != 0)
11868     return SDValue();
11869 
11870   SDValue Extend = InsertVectorElt.getOperand(1);
11871   unsigned ExtendOpcode = Extend.getOpcode();
11872 
11873   bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
11874                 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
11875                 ExtendOpcode == ISD::AssertSext;
11876   if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
11877       ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
11878     return SDValue();
11879 
11880   EVT TargetType = VectorShuffle.getValueType();
11881   EVT PreExtendType = calculatePreExtendType(Extend, DAG);
11882 
11883   if ((TargetType != MVT::v8i16 && TargetType != MVT::v4i32 &&
11884        TargetType != MVT::v2i64) ||
11885       (PreExtendType == MVT::Other))
11886     return SDValue();
11887 
11888   // Restrict valid pre-extend data type
11889   if (PreExtendType != MVT::i8 && PreExtendType != MVT::i16 &&
11890       PreExtendType != MVT::i32)
11891     return SDValue();
11892 
11893   EVT PreExtendVT = TargetType.changeVectorElementType(PreExtendType);
11894 
11895   if (PreExtendVT.getVectorElementCount() != TargetType.getVectorElementCount())
11896     return SDValue();
11897 
11898   if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2)
11899     return SDValue();
11900 
11901   SDLoc DL(VectorShuffle);
11902 
11903   SDValue InsertVectorNode = DAG.getNode(
11904       InsertVectorElt.getOpcode(), DL, PreExtendVT, DAG.getUNDEF(PreExtendVT),
11905       DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, PreExtendType),
11906       DAG.getConstant(0, DL, MVT::i64));
11907 
11908   std::vector<int> ShuffleMask(TargetType.getVectorElementCount().getValue());
11909 
11910   SDValue VectorShuffleNode =
11911       DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode,
11912                            DAG.getUNDEF(PreExtendVT), ShuffleMask);
11913 
11914   SDValue ExtendNode = DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
11915                                    DL, TargetType, VectorShuffleNode);
11916 
11917   return ExtendNode;
11918 }
11919 
11920 /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
11921 /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
11922 static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
11923   // If the value type isn't a vector, none of the operands are going to be dups
11924   if (!Mul->getValueType(0).isVector())
11925     return SDValue();
11926 
11927   SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG);
11928   SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG);
11929 
11930   // Neither operands have been changed, don't make any further changes
11931   if (!Op0 && !Op1)
11932     return SDValue();
11933 
11934   SDLoc DL(Mul);
11935   return DAG.getNode(Mul->getOpcode(), DL, Mul->getValueType(0),
11936                      Op0 ? Op0 : Mul->getOperand(0),
11937                      Op1 ? Op1 : Mul->getOperand(1));
11938 }
11939 
11940 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
11941                                  TargetLowering::DAGCombinerInfo &DCI,
11942                                  const AArch64Subtarget *Subtarget) {
11943 
11944   if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
11945     return Ext;
11946 
11947   if (DCI.isBeforeLegalizeOps())
11948     return SDValue();
11949 
11950   // The below optimizations require a constant RHS.
11951   if (!isa<ConstantSDNode>(N->getOperand(1)))
11952     return SDValue();
11953 
11954   SDValue N0 = N->getOperand(0);
11955   ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
11956   const APInt &ConstValue = C->getAPIntValue();
11957 
11958   // Allow the scaling to be folded into the `cnt` instruction by preventing
11959   // the scaling to be obscured here. This makes it easier to pattern match.
11960   if (IsSVECntIntrinsic(N0) ||
11961      (N0->getOpcode() == ISD::TRUNCATE &&
11962       (IsSVECntIntrinsic(N0->getOperand(0)))))
11963        if (ConstValue.sge(1) && ConstValue.sle(16))
11964          return SDValue();
11965 
11966   // Multiplication of a power of two plus/minus one can be done more
11967   // cheaply as as shift+add/sub. For now, this is true unilaterally. If
11968   // future CPUs have a cheaper MADD instruction, this may need to be
11969   // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
11970   // 64-bit is 5 cycles, so this is always a win.
11971   // More aggressively, some multiplications N0 * C can be lowered to
11972   // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
11973   // e.g. 6=3*2=(2+1)*2.
11974   // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
11975   // which equals to (1+2)*16-(1+2).
11976   // TrailingZeroes is used to test if the mul can be lowered to
11977   // shift+add+shift.
11978   unsigned TrailingZeroes = ConstValue.countTrailingZeros();
11979   if (TrailingZeroes) {
11980     // Conservatively do not lower to shift+add+shift if the mul might be
11981     // folded into smul or umul.
11982     if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
11983                             isZeroExtended(N0.getNode(), DAG)))
11984       return SDValue();
11985     // Conservatively do not lower to shift+add+shift if the mul might be
11986     // folded into madd or msub.
11987     if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
11988                            N->use_begin()->getOpcode() == ISD::SUB))
11989       return SDValue();
11990   }
11991   // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
11992   // and shift+add+shift.
11993   APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
11994 
11995   unsigned ShiftAmt, AddSubOpc;
11996   // Is the shifted value the LHS operand of the add/sub?
11997   bool ShiftValUseIsN0 = true;
11998   // Do we need to negate the result?
11999   bool NegateResult = false;
12000 
12001   if (ConstValue.isNonNegative()) {
12002     // (mul x, 2^N + 1) => (add (shl x, N), x)
12003     // (mul x, 2^N - 1) => (sub (shl x, N), x)
12004     // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
12005     APInt SCVMinus1 = ShiftedConstValue - 1;
12006     APInt CVPlus1 = ConstValue + 1;
12007     if (SCVMinus1.isPowerOf2()) {
12008       ShiftAmt = SCVMinus1.logBase2();
12009       AddSubOpc = ISD::ADD;
12010     } else if (CVPlus1.isPowerOf2()) {
12011       ShiftAmt = CVPlus1.logBase2();
12012       AddSubOpc = ISD::SUB;
12013     } else
12014       return SDValue();
12015   } else {
12016     // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
12017     // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
12018     APInt CVNegPlus1 = -ConstValue + 1;
12019     APInt CVNegMinus1 = -ConstValue - 1;
12020     if (CVNegPlus1.isPowerOf2()) {
12021       ShiftAmt = CVNegPlus1.logBase2();
12022       AddSubOpc = ISD::SUB;
12023       ShiftValUseIsN0 = false;
12024     } else if (CVNegMinus1.isPowerOf2()) {
12025       ShiftAmt = CVNegMinus1.logBase2();
12026       AddSubOpc = ISD::ADD;
12027       NegateResult = true;
12028     } else
12029       return SDValue();
12030   }
12031 
12032   SDLoc DL(N);
12033   EVT VT = N->getValueType(0);
12034   SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
12035                                    DAG.getConstant(ShiftAmt, DL, MVT::i64));
12036 
12037   SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
12038   SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
12039   SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
12040   assert(!(NegateResult && TrailingZeroes) &&
12041          "NegateResult and TrailingZeroes cannot both be true for now.");
12042   // Negate the result.
12043   if (NegateResult)
12044     return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
12045   // Shift the result.
12046   if (TrailingZeroes)
12047     return DAG.getNode(ISD::SHL, DL, VT, Res,
12048                        DAG.getConstant(TrailingZeroes, DL, MVT::i64));
12049   return Res;
12050 }
12051 
12052 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
12053                                                          SelectionDAG &DAG) {
12054   // Take advantage of vector comparisons producing 0 or -1 in each lane to
12055   // optimize away operation when it's from a constant.
12056   //
12057   // The general transformation is:
12058   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
12059   //       AND(VECTOR_CMP(x,y), constant2)
12060   //    constant2 = UNARYOP(constant)
12061 
12062   // Early exit if this isn't a vector operation, the operand of the
12063   // unary operation isn't a bitwise AND, or if the sizes of the operations
12064   // aren't the same.
12065   EVT VT = N->getValueType(0);
12066   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
12067       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
12068       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
12069     return SDValue();
12070 
12071   // Now check that the other operand of the AND is a constant. We could
12072   // make the transformation for non-constant splats as well, but it's unclear
12073   // that would be a benefit as it would not eliminate any operations, just
12074   // perform one more step in scalar code before moving to the vector unit.
12075   if (BuildVectorSDNode *BV =
12076           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
12077     // Bail out if the vector isn't a constant.
12078     if (!BV->isConstant())
12079       return SDValue();
12080 
12081     // Everything checks out. Build up the new and improved node.
12082     SDLoc DL(N);
12083     EVT IntVT = BV->getValueType(0);
12084     // Create a new constant of the appropriate type for the transformed
12085     // DAG.
12086     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
12087     // The AND node needs bitcasts to/from an integer vector type around it.
12088     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
12089     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
12090                                  N->getOperand(0)->getOperand(0), MaskConst);
12091     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
12092     return Res;
12093   }
12094 
12095   return SDValue();
12096 }
12097 
12098 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
12099                                      const AArch64Subtarget *Subtarget) {
12100   // First try to optimize away the conversion when it's conditionally from
12101   // a constant. Vectors only.
12102   if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
12103     return Res;
12104 
12105   EVT VT = N->getValueType(0);
12106   if (VT != MVT::f32 && VT != MVT::f64)
12107     return SDValue();
12108 
12109   // Only optimize when the source and destination types have the same width.
12110   if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
12111     return SDValue();
12112 
12113   // If the result of an integer load is only used by an integer-to-float
12114   // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
12115   // This eliminates an "integer-to-vector-move" UOP and improves throughput.
12116   SDValue N0 = N->getOperand(0);
12117   if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
12118       // Do not change the width of a volatile load.
12119       !cast<LoadSDNode>(N0)->isVolatile()) {
12120     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
12121     SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
12122                                LN0->getPointerInfo(), LN0->getAlignment(),
12123                                LN0->getMemOperand()->getFlags());
12124 
12125     // Make sure successors of the original load stay after it by updating them
12126     // to use the new Chain.
12127     DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
12128 
12129     unsigned Opcode =
12130         (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
12131     return DAG.getNode(Opcode, SDLoc(N), VT, Load);
12132   }
12133 
12134   return SDValue();
12135 }
12136 
12137 /// Fold a floating-point multiply by power of two into floating-point to
12138 /// fixed-point conversion.
12139 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
12140                                      TargetLowering::DAGCombinerInfo &DCI,
12141                                      const AArch64Subtarget *Subtarget) {
12142   if (!Subtarget->hasNEON())
12143     return SDValue();
12144 
12145   if (!N->getValueType(0).isSimple())
12146     return SDValue();
12147 
12148   SDValue Op = N->getOperand(0);
12149   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
12150       Op.getOpcode() != ISD::FMUL)
12151     return SDValue();
12152 
12153   SDValue ConstVec = Op->getOperand(1);
12154   if (!isa<BuildVectorSDNode>(ConstVec))
12155     return SDValue();
12156 
12157   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
12158   uint32_t FloatBits = FloatTy.getSizeInBits();
12159   if (FloatBits != 32 && FloatBits != 64)
12160     return SDValue();
12161 
12162   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
12163   uint32_t IntBits = IntTy.getSizeInBits();
12164   if (IntBits != 16 && IntBits != 32 && IntBits != 64)
12165     return SDValue();
12166 
12167   // Avoid conversions where iN is larger than the float (e.g., float -> i64).
12168   if (IntBits > FloatBits)
12169     return SDValue();
12170 
12171   BitVector UndefElements;
12172   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
12173   int32_t Bits = IntBits == 64 ? 64 : 32;
12174   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
12175   if (C == -1 || C == 0 || C > Bits)
12176     return SDValue();
12177 
12178   MVT ResTy;
12179   unsigned NumLanes = Op.getValueType().getVectorNumElements();
12180   switch (NumLanes) {
12181   default:
12182     return SDValue();
12183   case 2:
12184     ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
12185     break;
12186   case 4:
12187     ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
12188     break;
12189   }
12190 
12191   if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
12192     return SDValue();
12193 
12194   assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
12195          "Illegal vector type after legalization");
12196 
12197   SDLoc DL(N);
12198   bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
12199   unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
12200                                       : Intrinsic::aarch64_neon_vcvtfp2fxu;
12201   SDValue FixConv =
12202       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
12203                   DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
12204                   Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
12205   // We can handle smaller integers by generating an extra trunc.
12206   if (IntBits < FloatBits)
12207     FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
12208 
12209   return FixConv;
12210 }
12211 
12212 /// Fold a floating-point divide by power of two into fixed-point to
12213 /// floating-point conversion.
12214 static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
12215                                   TargetLowering::DAGCombinerInfo &DCI,
12216                                   const AArch64Subtarget *Subtarget) {
12217   if (!Subtarget->hasNEON())
12218     return SDValue();
12219 
12220   SDValue Op = N->getOperand(0);
12221   unsigned Opc = Op->getOpcode();
12222   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
12223       !Op.getOperand(0).getValueType().isSimple() ||
12224       (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
12225     return SDValue();
12226 
12227   SDValue ConstVec = N->getOperand(1);
12228   if (!isa<BuildVectorSDNode>(ConstVec))
12229     return SDValue();
12230 
12231   MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
12232   int32_t IntBits = IntTy.getSizeInBits();
12233   if (IntBits != 16 && IntBits != 32 && IntBits != 64)
12234     return SDValue();
12235 
12236   MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
12237   int32_t FloatBits = FloatTy.getSizeInBits();
12238   if (FloatBits != 32 && FloatBits != 64)
12239     return SDValue();
12240 
12241   // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
12242   if (IntBits > FloatBits)
12243     return SDValue();
12244 
12245   BitVector UndefElements;
12246   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
12247   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
12248   if (C == -1 || C == 0 || C > FloatBits)
12249     return SDValue();
12250 
12251   MVT ResTy;
12252   unsigned NumLanes = Op.getValueType().getVectorNumElements();
12253   switch (NumLanes) {
12254   default:
12255     return SDValue();
12256   case 2:
12257     ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
12258     break;
12259   case 4:
12260     ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
12261     break;
12262   }
12263 
12264   if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
12265     return SDValue();
12266 
12267   SDLoc DL(N);
12268   SDValue ConvInput = Op.getOperand(0);
12269   bool IsSigned = Opc == ISD::SINT_TO_FP;
12270   if (IntBits < FloatBits)
12271     ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
12272                             ResTy, ConvInput);
12273 
12274   unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
12275                                       : Intrinsic::aarch64_neon_vcvtfxu2fp;
12276   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
12277                      DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
12278                      DAG.getConstant(C, DL, MVT::i32));
12279 }
12280 
12281 /// An EXTR instruction is made up of two shifts, ORed together. This helper
12282 /// searches for and classifies those shifts.
12283 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
12284                          bool &FromHi) {
12285   if (N.getOpcode() == ISD::SHL)
12286     FromHi = false;
12287   else if (N.getOpcode() == ISD::SRL)
12288     FromHi = true;
12289   else
12290     return false;
12291 
12292   if (!isa<ConstantSDNode>(N.getOperand(1)))
12293     return false;
12294 
12295   ShiftAmount = N->getConstantOperandVal(1);
12296   Src = N->getOperand(0);
12297   return true;
12298 }
12299 
12300 /// EXTR instruction extracts a contiguous chunk of bits from two existing
12301 /// registers viewed as a high/low pair. This function looks for the pattern:
12302 /// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
12303 /// with an EXTR. Can't quite be done in TableGen because the two immediates
12304 /// aren't independent.
12305 static SDValue tryCombineToEXTR(SDNode *N,
12306                                 TargetLowering::DAGCombinerInfo &DCI) {
12307   SelectionDAG &DAG = DCI.DAG;
12308   SDLoc DL(N);
12309   EVT VT = N->getValueType(0);
12310 
12311   assert(N->getOpcode() == ISD::OR && "Unexpected root");
12312 
12313   if (VT != MVT::i32 && VT != MVT::i64)
12314     return SDValue();
12315 
12316   SDValue LHS;
12317   uint32_t ShiftLHS = 0;
12318   bool LHSFromHi = false;
12319   if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
12320     return SDValue();
12321 
12322   SDValue RHS;
12323   uint32_t ShiftRHS = 0;
12324   bool RHSFromHi = false;
12325   if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
12326     return SDValue();
12327 
12328   // If they're both trying to come from the high part of the register, they're
12329   // not really an EXTR.
12330   if (LHSFromHi == RHSFromHi)
12331     return SDValue();
12332 
12333   if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
12334     return SDValue();
12335 
12336   if (LHSFromHi) {
12337     std::swap(LHS, RHS);
12338     std::swap(ShiftLHS, ShiftRHS);
12339   }
12340 
12341   return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
12342                      DAG.getConstant(ShiftRHS, DL, MVT::i64));
12343 }
12344 
12345 static SDValue tryCombineToBSL(SDNode *N,
12346                                 TargetLowering::DAGCombinerInfo &DCI) {
12347   EVT VT = N->getValueType(0);
12348   SelectionDAG &DAG = DCI.DAG;
12349   SDLoc DL(N);
12350 
12351   if (!VT.isVector())
12352     return SDValue();
12353 
12354   SDValue N0 = N->getOperand(0);
12355   if (N0.getOpcode() != ISD::AND)
12356     return SDValue();
12357 
12358   SDValue N1 = N->getOperand(1);
12359   if (N1.getOpcode() != ISD::AND)
12360     return SDValue();
12361 
12362   // We only have to look for constant vectors here since the general, variable
12363   // case can be handled in TableGen.
12364   unsigned Bits = VT.getScalarSizeInBits();
12365   uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
12366   for (int i = 1; i >= 0; --i)
12367     for (int j = 1; j >= 0; --j) {
12368       BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
12369       BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
12370       if (!BVN0 || !BVN1)
12371         continue;
12372 
12373       bool FoundMatch = true;
12374       for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
12375         ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
12376         ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
12377         if (!CN0 || !CN1 ||
12378             CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
12379           FoundMatch = false;
12380           break;
12381         }
12382       }
12383 
12384       if (FoundMatch)
12385         return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
12386                            N0->getOperand(1 - i), N1->getOperand(1 - j));
12387     }
12388 
12389   return SDValue();
12390 }
12391 
12392 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
12393                                 const AArch64Subtarget *Subtarget) {
12394   // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
12395   SelectionDAG &DAG = DCI.DAG;
12396   EVT VT = N->getValueType(0);
12397 
12398   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
12399     return SDValue();
12400 
12401   if (SDValue Res = tryCombineToEXTR(N, DCI))
12402     return Res;
12403 
12404   if (SDValue Res = tryCombineToBSL(N, DCI))
12405     return Res;
12406 
12407   return SDValue();
12408 }
12409 
12410 static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
12411   if (!MemVT.getVectorElementType().isSimple())
12412     return false;
12413 
12414   uint64_t MaskForTy = 0ull;
12415   switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
12416   case MVT::i8:
12417     MaskForTy = 0xffull;
12418     break;
12419   case MVT::i16:
12420     MaskForTy = 0xffffull;
12421     break;
12422   case MVT::i32:
12423     MaskForTy = 0xffffffffull;
12424     break;
12425   default:
12426     return false;
12427     break;
12428   }
12429 
12430   if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
12431     if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
12432       return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
12433 
12434   return false;
12435 }
12436 
12437 static SDValue performSVEAndCombine(SDNode *N,
12438                                     TargetLowering::DAGCombinerInfo &DCI) {
12439   if (DCI.isBeforeLegalizeOps())
12440     return SDValue();
12441 
12442   SelectionDAG &DAG = DCI.DAG;
12443   SDValue Src = N->getOperand(0);
12444   unsigned Opc = Src->getOpcode();
12445 
12446   // Zero/any extend of an unsigned unpack
12447   if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
12448     SDValue UnpkOp = Src->getOperand(0);
12449     SDValue Dup = N->getOperand(1);
12450 
12451     if (Dup.getOpcode() != AArch64ISD::DUP)
12452       return SDValue();
12453 
12454     SDLoc DL(N);
12455     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
12456     uint64_t ExtVal = C->getZExtValue();
12457 
12458     // If the mask is fully covered by the unpack, we don't need to push
12459     // a new AND onto the operand
12460     EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
12461     if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
12462         (ExtVal == 0xFFFF && EltTy == MVT::i16) ||
12463         (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
12464       return Src;
12465 
12466     // Truncate to prevent a DUP with an over wide constant
12467     APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
12468 
12469     // Otherwise, make sure we propagate the AND to the operand
12470     // of the unpack
12471     Dup = DAG.getNode(AArch64ISD::DUP, DL,
12472                       UnpkOp->getValueType(0),
12473                       DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
12474 
12475     SDValue And = DAG.getNode(ISD::AND, DL,
12476                               UnpkOp->getValueType(0), UnpkOp, Dup);
12477 
12478     return DAG.getNode(Opc, DL, N->getValueType(0), And);
12479   }
12480 
12481   if (!EnableCombineMGatherIntrinsics)
12482     return SDValue();
12483 
12484   SDValue Mask = N->getOperand(1);
12485 
12486   if (!Src.hasOneUse())
12487     return SDValue();
12488 
12489   EVT MemVT;
12490 
12491   // SVE load instructions perform an implicit zero-extend, which makes them
12492   // perfect candidates for combining.
12493   switch (Opc) {
12494   case AArch64ISD::LD1_MERGE_ZERO:
12495   case AArch64ISD::LDNF1_MERGE_ZERO:
12496   case AArch64ISD::LDFF1_MERGE_ZERO:
12497     MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
12498     break;
12499   case AArch64ISD::GLD1_MERGE_ZERO:
12500   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
12501   case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
12502   case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
12503   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
12504   case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
12505   case AArch64ISD::GLD1_IMM_MERGE_ZERO:
12506   case AArch64ISD::GLDFF1_MERGE_ZERO:
12507   case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
12508   case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
12509   case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
12510   case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
12511   case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
12512   case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
12513   case AArch64ISD::GLDNT1_MERGE_ZERO:
12514     MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
12515     break;
12516   default:
12517     return SDValue();
12518   }
12519 
12520   if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
12521     return Src;
12522 
12523   return SDValue();
12524 }
12525 
12526 static SDValue performANDCombine(SDNode *N,
12527                                  TargetLowering::DAGCombinerInfo &DCI) {
12528   SelectionDAG &DAG = DCI.DAG;
12529   SDValue LHS = N->getOperand(0);
12530   EVT VT = N->getValueType(0);
12531   if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT))
12532     return SDValue();
12533 
12534   if (VT.isScalableVector())
12535     return performSVEAndCombine(N, DCI);
12536 
12537   // The combining code below works only for NEON vectors. In particular, it
12538   // does not work for SVE when dealing with vectors wider than 128 bits.
12539   if (!(VT.is64BitVector() || VT.is128BitVector()))
12540     return SDValue();
12541 
12542   BuildVectorSDNode *BVN =
12543       dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
12544   if (!BVN)
12545     return SDValue();
12546 
12547   // AND does not accept an immediate, so check if we can use a BIC immediate
12548   // instruction instead. We do this here instead of using a (and x, (mvni imm))
12549   // pattern in isel, because some immediates may be lowered to the preferred
12550   // (and x, (movi imm)) form, even though an mvni representation also exists.
12551   APInt DefBits(VT.getSizeInBits(), 0);
12552   APInt UndefBits(VT.getSizeInBits(), 0);
12553   if (resolveBuildVector(BVN, DefBits, UndefBits)) {
12554     SDValue NewOp;
12555 
12556     DefBits = ~DefBits;
12557     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
12558                                     DefBits, &LHS)) ||
12559         (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
12560                                     DefBits, &LHS)))
12561       return NewOp;
12562 
12563     UndefBits = ~UndefBits;
12564     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
12565                                     UndefBits, &LHS)) ||
12566         (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
12567                                     UndefBits, &LHS)))
12568       return NewOp;
12569   }
12570 
12571   return SDValue();
12572 }
12573 
12574 static SDValue performSRLCombine(SDNode *N,
12575                                  TargetLowering::DAGCombinerInfo &DCI) {
12576   SelectionDAG &DAG = DCI.DAG;
12577   EVT VT = N->getValueType(0);
12578   if (VT != MVT::i32 && VT != MVT::i64)
12579     return SDValue();
12580 
12581   // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
12582   // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
12583   // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero.
12584   SDValue N0 = N->getOperand(0);
12585   if (N0.getOpcode() == ISD::BSWAP) {
12586     SDLoc DL(N);
12587     SDValue N1 = N->getOperand(1);
12588     SDValue N00 = N0.getOperand(0);
12589     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
12590       uint64_t ShiftAmt = C->getZExtValue();
12591       if (VT == MVT::i32 && ShiftAmt == 16 &&
12592           DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16)))
12593         return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
12594       if (VT == MVT::i64 && ShiftAmt == 32 &&
12595           DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32)))
12596         return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
12597     }
12598   }
12599   return SDValue();
12600 }
12601 
12602 // Attempt to form urhadd(OpA, OpB) from
12603 // truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1))
12604 // or uhadd(OpA, OpB) from truncate(vlshr(add(zext(OpA), zext(OpB)), 1)).
12605 // The original form of the first expression is
12606 // truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and the
12607 // (OpA + OpB + 1) subexpression will have been changed to (OpB - (~OpA)).
12608 // Before this function is called the srl will have been lowered to
12609 // AArch64ISD::VLSHR.
12610 // This pass can also recognize signed variants of the patterns that use sign
12611 // extension instead of zero extension and form a srhadd(OpA, OpB) or a
12612 // shadd(OpA, OpB) from them.
12613 static SDValue
12614 performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
12615                              SelectionDAG &DAG) {
12616   EVT VT = N->getValueType(0);
12617 
12618   // Since we are looking for a right shift by a constant value of 1 and we are
12619   // operating on types at least 16 bits in length (sign/zero extended OpA and
12620   // OpB, which are at least 8 bits), it follows that the truncate will always
12621   // discard the shifted-in bit and therefore the right shift will be logical
12622   // regardless of the signedness of OpA and OpB.
12623   SDValue Shift = N->getOperand(0);
12624   if (Shift.getOpcode() != AArch64ISD::VLSHR)
12625     return SDValue();
12626 
12627   // Is the right shift using an immediate value of 1?
12628   uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
12629   if (ShiftAmount != 1)
12630     return SDValue();
12631 
12632   SDValue ExtendOpA, ExtendOpB;
12633   SDValue ShiftOp0 = Shift.getOperand(0);
12634   unsigned ShiftOp0Opc = ShiftOp0.getOpcode();
12635   if (ShiftOp0Opc == ISD::SUB) {
12636 
12637     SDValue Xor = ShiftOp0.getOperand(1);
12638     if (Xor.getOpcode() != ISD::XOR)
12639       return SDValue();
12640 
12641     // Is the XOR using a constant amount of all ones in the right hand side?
12642     uint64_t C;
12643     if (!isAllConstantBuildVector(Xor.getOperand(1), C))
12644       return SDValue();
12645 
12646     unsigned ElemSizeInBits = VT.getScalarSizeInBits();
12647     APInt CAsAPInt(ElemSizeInBits, C);
12648     if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits))
12649       return SDValue();
12650 
12651     ExtendOpA = Xor.getOperand(0);
12652     ExtendOpB = ShiftOp0.getOperand(0);
12653   } else if (ShiftOp0Opc == ISD::ADD) {
12654     ExtendOpA = ShiftOp0.getOperand(0);
12655     ExtendOpB = ShiftOp0.getOperand(1);
12656   } else
12657     return SDValue();
12658 
12659   unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
12660   unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
12661   if (!(ExtendOpAOpc == ExtendOpBOpc &&
12662         (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
12663     return SDValue();
12664 
12665   // Is the result of the right shift being truncated to the same value type as
12666   // the original operands, OpA and OpB?
12667   SDValue OpA = ExtendOpA.getOperand(0);
12668   SDValue OpB = ExtendOpB.getOperand(0);
12669   EVT OpAVT = OpA.getValueType();
12670   assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
12671   if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
12672     return SDValue();
12673 
12674   SDLoc DL(N);
12675   bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
12676   bool IsRHADD = ShiftOp0Opc == ISD::SUB;
12677   unsigned HADDOpc = IsSignExtend
12678                          ? (IsRHADD ? AArch64ISD::SRHADD : AArch64ISD::SHADD)
12679                          : (IsRHADD ? AArch64ISD::URHADD : AArch64ISD::UHADD);
12680   SDValue ResultHADD = DAG.getNode(HADDOpc, DL, VT, OpA, OpB);
12681 
12682   return ResultHADD;
12683 }
12684 
12685 static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
12686   switch (Opcode) {
12687   case ISD::FADD:
12688     return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
12689   case ISD::ADD:
12690     return VT == MVT::i64;
12691   default:
12692     return false;
12693   }
12694 }
12695 
12696 static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) {
12697   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
12698   ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1);
12699 
12700   EVT VT = N->getValueType(0);
12701   const bool FullFP16 =
12702       static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
12703 
12704   // Rewrite for pairwise fadd pattern
12705   //   (f32 (extract_vector_elt
12706   //           (fadd (vXf32 Other)
12707   //                 (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
12708   // ->
12709   //   (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
12710   //              (extract_vector_elt (vXf32 Other) 1))
12711   if (ConstantN1 && ConstantN1->getZExtValue() == 0 &&
12712       hasPairwiseAdd(N0->getOpcode(), VT, FullFP16)) {
12713     SDLoc DL(N0);
12714     SDValue N00 = N0->getOperand(0);
12715     SDValue N01 = N0->getOperand(1);
12716 
12717     ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
12718     SDValue Other = N00;
12719 
12720     // And handle the commutative case.
12721     if (!Shuffle) {
12722       Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
12723       Other = N01;
12724     }
12725 
12726     if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
12727         Other == Shuffle->getOperand(0)) {
12728       return DAG.getNode(N0->getOpcode(), DL, VT,
12729                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
12730                                      DAG.getConstant(0, DL, MVT::i64)),
12731                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
12732                                      DAG.getConstant(1, DL, MVT::i64)));
12733     }
12734   }
12735 
12736   return SDValue();
12737 }
12738 
12739 static SDValue performConcatVectorsCombine(SDNode *N,
12740                                            TargetLowering::DAGCombinerInfo &DCI,
12741                                            SelectionDAG &DAG) {
12742   SDLoc dl(N);
12743   EVT VT = N->getValueType(0);
12744   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
12745   unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
12746 
12747   // Optimize concat_vectors of truncated vectors, where the intermediate
12748   // type is illegal, to avoid said illegality,  e.g.,
12749   //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
12750   //                          (v2i16 (truncate (v2i64)))))
12751   // ->
12752   //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
12753   //                                    (v4i32 (bitcast (v2i64))),
12754   //                                    <0, 2, 4, 6>)))
12755   // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
12756   // on both input and result type, so we might generate worse code.
12757   // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
12758   if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
12759       N1Opc == ISD::TRUNCATE) {
12760     SDValue N00 = N0->getOperand(0);
12761     SDValue N10 = N1->getOperand(0);
12762     EVT N00VT = N00.getValueType();
12763 
12764     if (N00VT == N10.getValueType() &&
12765         (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
12766         N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
12767       MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
12768       SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
12769       for (size_t i = 0; i < Mask.size(); ++i)
12770         Mask[i] = i * 2;
12771       return DAG.getNode(ISD::TRUNCATE, dl, VT,
12772                          DAG.getVectorShuffle(
12773                              MidVT, dl,
12774                              DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
12775                              DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
12776     }
12777   }
12778 
12779   // Wait 'til after everything is legalized to try this. That way we have
12780   // legal vector types and such.
12781   if (DCI.isBeforeLegalizeOps())
12782     return SDValue();
12783 
12784   // Optimise concat_vectors of two [us]rhadds or [us]hadds that use extracted
12785   // subvectors from the same original vectors. Combine these into a single
12786   // [us]rhadd or [us]hadd that operates on the two original vectors. Example:
12787   //  (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
12788   //                                        extract_subvector (v16i8 OpB,
12789   //                                        <0>))),
12790   //                         (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
12791   //                                        extract_subvector (v16i8 OpB,
12792   //                                        <8>)))))
12793   // ->
12794   //  (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
12795   if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
12796       (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD ||
12797        N0Opc == AArch64ISD::UHADD || N0Opc == AArch64ISD::SHADD)) {
12798     SDValue N00 = N0->getOperand(0);
12799     SDValue N01 = N0->getOperand(1);
12800     SDValue N10 = N1->getOperand(0);
12801     SDValue N11 = N1->getOperand(1);
12802 
12803     EVT N00VT = N00.getValueType();
12804     EVT N10VT = N10.getValueType();
12805 
12806     if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12807         N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12808         N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12809         N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
12810       SDValue N00Source = N00->getOperand(0);
12811       SDValue N01Source = N01->getOperand(0);
12812       SDValue N10Source = N10->getOperand(0);
12813       SDValue N11Source = N11->getOperand(0);
12814 
12815       if (N00Source == N10Source && N01Source == N11Source &&
12816           N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
12817         assert(N0.getValueType() == N1.getValueType());
12818 
12819         uint64_t N00Index = N00.getConstantOperandVal(1);
12820         uint64_t N01Index = N01.getConstantOperandVal(1);
12821         uint64_t N10Index = N10.getConstantOperandVal(1);
12822         uint64_t N11Index = N11.getConstantOperandVal(1);
12823 
12824         if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
12825             N10Index == N00VT.getVectorNumElements())
12826           return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
12827       }
12828     }
12829   }
12830 
12831   // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
12832   // splat. The indexed instructions are going to be expecting a DUPLANE64, so
12833   // canonicalise to that.
12834   if (N0 == N1 && VT.getVectorNumElements() == 2) {
12835     assert(VT.getScalarSizeInBits() == 64);
12836     return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
12837                        DAG.getConstant(0, dl, MVT::i64));
12838   }
12839 
12840   // Canonicalise concat_vectors so that the right-hand vector has as few
12841   // bit-casts as possible before its real operation. The primary matching
12842   // destination for these operations will be the narrowing "2" instructions,
12843   // which depend on the operation being performed on this right-hand vector.
12844   // For example,
12845   //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
12846   // becomes
12847   //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
12848 
12849   if (N1Opc != ISD::BITCAST)
12850     return SDValue();
12851   SDValue RHS = N1->getOperand(0);
12852   MVT RHSTy = RHS.getValueType().getSimpleVT();
12853   // If the RHS is not a vector, this is not the pattern we're looking for.
12854   if (!RHSTy.isVector())
12855     return SDValue();
12856 
12857   LLVM_DEBUG(
12858       dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
12859 
12860   MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
12861                                   RHSTy.getVectorNumElements() * 2);
12862   return DAG.getNode(ISD::BITCAST, dl, VT,
12863                      DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
12864                                  DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
12865                                  RHS));
12866 }
12867 
12868 static SDValue tryCombineFixedPointConvert(SDNode *N,
12869                                            TargetLowering::DAGCombinerInfo &DCI,
12870                                            SelectionDAG &DAG) {
12871   // Wait until after everything is legalized to try this. That way we have
12872   // legal vector types and such.
12873   if (DCI.isBeforeLegalizeOps())
12874     return SDValue();
12875   // Transform a scalar conversion of a value from a lane extract into a
12876   // lane extract of a vector conversion. E.g., from foo1 to foo2:
12877   // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
12878   // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
12879   //
12880   // The second form interacts better with instruction selection and the
12881   // register allocator to avoid cross-class register copies that aren't
12882   // coalescable due to a lane reference.
12883 
12884   // Check the operand and see if it originates from a lane extract.
12885   SDValue Op1 = N->getOperand(1);
12886   if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12887     // Yep, no additional predication needed. Perform the transform.
12888     SDValue IID = N->getOperand(0);
12889     SDValue Shift = N->getOperand(2);
12890     SDValue Vec = Op1.getOperand(0);
12891     SDValue Lane = Op1.getOperand(1);
12892     EVT ResTy = N->getValueType(0);
12893     EVT VecResTy;
12894     SDLoc DL(N);
12895 
12896     // The vector width should be 128 bits by the time we get here, even
12897     // if it started as 64 bits (the extract_vector handling will have
12898     // done so).
12899     assert(Vec.getValueSizeInBits() == 128 &&
12900            "unexpected vector size on extract_vector_elt!");
12901     if (Vec.getValueType() == MVT::v4i32)
12902       VecResTy = MVT::v4f32;
12903     else if (Vec.getValueType() == MVT::v2i64)
12904       VecResTy = MVT::v2f64;
12905     else
12906       llvm_unreachable("unexpected vector type!");
12907 
12908     SDValue Convert =
12909         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
12910     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
12911   }
12912   return SDValue();
12913 }
12914 
12915 // AArch64 high-vector "long" operations are formed by performing the non-high
12916 // version on an extract_subvector of each operand which gets the high half:
12917 //
12918 //  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
12919 //
12920 // However, there are cases which don't have an extract_high explicitly, but
12921 // have another operation that can be made compatible with one for free. For
12922 // example:
12923 //
12924 //  (dupv64 scalar) --> (extract_high (dup128 scalar))
12925 //
12926 // This routine does the actual conversion of such DUPs, once outer routines
12927 // have determined that everything else is in order.
12928 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
12929 // similarly here.
12930 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
12931   switch (N.getOpcode()) {
12932   case AArch64ISD::DUP:
12933   case AArch64ISD::DUPLANE8:
12934   case AArch64ISD::DUPLANE16:
12935   case AArch64ISD::DUPLANE32:
12936   case AArch64ISD::DUPLANE64:
12937   case AArch64ISD::MOVI:
12938   case AArch64ISD::MOVIshift:
12939   case AArch64ISD::MOVIedit:
12940   case AArch64ISD::MOVImsl:
12941   case AArch64ISD::MVNIshift:
12942   case AArch64ISD::MVNImsl:
12943     break;
12944   default:
12945     // FMOV could be supported, but isn't very useful, as it would only occur
12946     // if you passed a bitcast' floating point immediate to an eligible long
12947     // integer op (addl, smull, ...).
12948     return SDValue();
12949   }
12950 
12951   MVT NarrowTy = N.getSimpleValueType();
12952   if (!NarrowTy.is64BitVector())
12953     return SDValue();
12954 
12955   MVT ElementTy = NarrowTy.getVectorElementType();
12956   unsigned NumElems = NarrowTy.getVectorNumElements();
12957   MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
12958 
12959   SDLoc dl(N);
12960   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
12961                      DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
12962                      DAG.getConstant(NumElems, dl, MVT::i64));
12963 }
12964 
12965 static bool isEssentiallyExtractHighSubvector(SDValue N) {
12966   if (N.getOpcode() == ISD::BITCAST)
12967     N = N.getOperand(0);
12968   if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
12969     return false;
12970   return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
12971          N.getOperand(0).getValueType().getVectorNumElements() / 2;
12972 }
12973 
12974 /// Helper structure to keep track of ISD::SET_CC operands.
12975 struct GenericSetCCInfo {
12976   const SDValue *Opnd0; ///< Address of operand 0 of the ISD::SETCC node.
12977   const SDValue *Opnd1; ///< Address of operand 1 of the ISD::SETCC node.
12978   ISD::CondCode CC;     ///< Condition code taken from the SETCC's operand 2.
12979 };
12980 
12981 /// Helper structure to keep track of a SET_CC lowered into AArch64 code.
12982 struct AArch64SetCCInfo {
12983   const SDValue *Cmp;     ///< Address of operand 3 of the AArch64ISD::CSEL.
12984   AArch64CC::CondCode CC; ///< Condition code taken from the CSEL's operand 2.
12985 };
12986 
12987 /// Helper structure to keep track of SetCC information. Holds either the
12987 /// generic or the AArch64 form; which one is recorded separately in
12987 /// SetCCInfoAndKind::IsAArch64.
12988 union SetCCInfo {
12989   GenericSetCCInfo Generic; ///< Filled in for a generic ISD::SETCC.
12990   AArch64SetCCInfo AArch64; ///< Filled in for an AArch64ISD::CSEL.
12991 };
12992 
12993 /// Helper structure to be able to read SetCC information.  If set to
12994 /// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
12995 /// GenericSetCCInfo.
12996 struct SetCCInfoAndKind {
12997   SetCCInfo Info;
12998   bool IsAArch64;
12999 };
13000 
13001 /// Check whether or not \p Op is a SET_CC operation, either a generic or
13002 /// an
13003 /// AArch64 lowered one.
13004 /// \p SetCCInfo is filled accordingly.
13005 /// \post SetCCInfo is meanginfull only when this function returns true.
13006 /// \return True when Op is a kind of SET_CC operation.
13007 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
13008   // If this is a setcc, this is straight forward.
13009   if (Op.getOpcode() == ISD::SETCC) {
13010     SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
13011     SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
13012     SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
13013     SetCCInfo.IsAArch64 = false;
13014     return true;
13015   }
13016   // Otherwise, check if this is a matching csel instruction.
13017   // In other words:
13018   // - csel 1, 0, cc
13019   // - csel 0, 1, !cc
13020   if (Op.getOpcode() != AArch64ISD::CSEL)
13021     return false;
13022   // Set the information about the operands.
13023   // TODO: we want the operands of the Cmp not the csel
13024   SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
13025   SetCCInfo.IsAArch64 = true;
13026   SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
13027       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
13028 
13029   // Check that the operands matches the constraints:
13030   // (1) Both operands must be constants.
13031   // (2) One must be 1 and the other must be 0.
13032   ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
13033   ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13034 
13035   // Check (1).
13036   if (!TValue || !FValue)
13037     return false;
13038 
13039   // Check (2).
13040   if (!TValue->isOne()) {
13041     // Update the comparison when we are interested in !cc.
13042     std::swap(TValue, FValue);
13043     SetCCInfo.Info.AArch64.CC =
13044         AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
13045   }
13046   return TValue->isOne() && FValue->isNullValue();
13047 }
13048 
13049 // Returns true if Op is setcc or zext of setcc.
13050 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
13051   if (isSetCC(Op, Info))
13052     return true;
13053   return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
13054     isSetCC(Op->getOperand(0), Info));
13055 }
13056 
13057 // The folding we want to perform is:
13058 // (add x, [zext] (setcc cc ...) )
13059 //   -->
13060 // (csel x, (add x, 1), !cc ...)
13061 //
13062 // The latter will get matched to a CSINC instruction.
13063 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
13064   assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
13065   SDValue LHS = Op->getOperand(0);
13066   SDValue RHS = Op->getOperand(1);
13067   SetCCInfoAndKind InfoAndKind;
13068 
13069   // If neither operand is a SET_CC, give up.
13070   if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
13071     std::swap(LHS, RHS);
13072     if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
13073       return SDValue();
13074   }
13075 
13076   // FIXME: This could be generatized to work for FP comparisons.
13077   EVT CmpVT = InfoAndKind.IsAArch64
13078                   ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
13079                   : InfoAndKind.Info.Generic.Opnd0->getValueType();
13080   if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
13081     return SDValue();
13082 
13083   SDValue CCVal;
13084   SDValue Cmp;
13085   SDLoc dl(Op);
13086   if (InfoAndKind.IsAArch64) {
13087     CCVal = DAG.getConstant(
13088         AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
13089         MVT::i32);
13090     Cmp = *InfoAndKind.Info.AArch64.Cmp;
13091   } else
13092     Cmp = getAArch64Cmp(
13093         *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
13094         ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
13095         dl);
13096 
13097   EVT VT = Op->getValueType(0);
13098   LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
13099   return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
13100 }
13101 
13102 // ADD(UADDV a, UADDV b) -->  UADDV(ADD a, b)
13103 static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
13104   EVT VT = N->getValueType(0);
13105   // Only scalar integer and vector types.
13106   if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
13107     return SDValue();
13108 
13109   SDValue LHS = N->getOperand(0);
13110   SDValue RHS = N->getOperand(1);
13111   if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13112       RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
13113     return SDValue();
13114 
13115   auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
13116   auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
13117   if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isNullValue())
13118     return SDValue();
13119 
13120   SDValue Op1 = LHS->getOperand(0);
13121   SDValue Op2 = RHS->getOperand(0);
13122   EVT OpVT1 = Op1.getValueType();
13123   EVT OpVT2 = Op2.getValueType();
13124   if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
13125       Op2.getOpcode() != AArch64ISD::UADDV ||
13126       OpVT1.getVectorElementType() != VT)
13127     return SDValue();
13128 
13129   SDValue Val1 = Op1.getOperand(0);
13130   SDValue Val2 = Op2.getOperand(0);
13131   EVT ValVT = Val1->getValueType(0);
13132   SDLoc DL(N);
13133   SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
13134   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
13135                      DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
13136                      DAG.getConstant(0, DL, MVT::i64));
13137 }
13138 
13139 // The basic add/sub long vector instructions have variants with "2" on the end
13140 // which act on the high-half of their inputs. They are normally matched by
13141 // patterns like:
13142 //
13143 // (add (zeroext (extract_high LHS)),
13144 //      (zeroext (extract_high RHS)))
13145 // -> uaddl2 vD, vN, vM
13146 //
13147 // However, if one of the extracts is something like a duplicate, this
13148 // instruction can still be used profitably. This function puts the DAG into a
13149 // more appropriate form for those patterns to trigger.
13150 static SDValue performAddSubLongCombine(SDNode *N,
13151                                         TargetLowering::DAGCombinerInfo &DCI,
13152                                         SelectionDAG &DAG) {
13153   if (DCI.isBeforeLegalizeOps())
13154     return SDValue();
13155 
13156   MVT VT = N->getSimpleValueType(0);
13157   if (!VT.is128BitVector()) {
13158     if (N->getOpcode() == ISD::ADD)
13159       return performSetccAddFolding(N, DAG);
13160     return SDValue();
13161   }
13162 
13163   // Make sure both branches are extended in the same way.
13164   SDValue LHS = N->getOperand(0);
13165   SDValue RHS = N->getOperand(1);
13166   if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
13167        LHS.getOpcode() != ISD::SIGN_EXTEND) ||
13168       LHS.getOpcode() != RHS.getOpcode())
13169     return SDValue();
13170 
13171   unsigned ExtType = LHS.getOpcode();
13172 
13173   // It's not worth doing if at least one of the inputs isn't already an
13174   // extract, but we don't know which it'll be so we have to try both.
13175   if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
13176     RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
13177     if (!RHS.getNode())
13178       return SDValue();
13179 
13180     RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
13181   } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
13182     LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
13183     if (!LHS.getNode())
13184       return SDValue();
13185 
13186     LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
13187   }
13188 
13189   return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
13190 }
13191 
13192 static SDValue performAddSubCombine(SDNode *N,
13193                                     TargetLowering::DAGCombinerInfo &DCI,
13194                                     SelectionDAG &DAG) {
13195   // Try to change sum of two reductions.
13196   if (SDValue Val = performUADDVCombine(N, DAG))
13197     return Val;
13198 
13199   return performAddSubLongCombine(N, DCI, DAG);
13200 }
13201 
13202 // Massage DAGs which we can use the high-half "long" operations on into
13203 // something isel will recognize better. E.g.
13204 //
13205 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
13206 //   (aarch64_neon_umull (extract_high (v2i64 vec)))
13207 //                     (extract_high (v2i64 (dup128 scalar)))))
13208 //
13209 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
13210                                        TargetLowering::DAGCombinerInfo &DCI,
13211                                        SelectionDAG &DAG) {
13212   if (DCI.isBeforeLegalizeOps())
13213     return SDValue();
13214 
13215   SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
13216   SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
13217   assert(LHS.getValueType().is64BitVector() &&
13218          RHS.getValueType().is64BitVector() &&
13219          "unexpected shape for long operation");
13220 
13221   // Either node could be a DUP, but it's not worth doing both of them (you'd
13222   // just as well use the non-high version) so look for a corresponding extract
13223   // operation on the other "wing".
13224   if (isEssentiallyExtractHighSubvector(LHS)) {
13225     RHS = tryExtendDUPToExtractHigh(RHS, DAG);
13226     if (!RHS.getNode())
13227       return SDValue();
13228   } else if (isEssentiallyExtractHighSubvector(RHS)) {
13229     LHS = tryExtendDUPToExtractHigh(LHS, DAG);
13230     if (!LHS.getNode())
13231       return SDValue();
13232   }
13233 
13234   if (IID == Intrinsic::not_intrinsic)
13235     return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
13236 
13237   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
13238                      N->getOperand(0), LHS, RHS);
13239 }
13240 
13241 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
13242   MVT ElemTy = N->getSimpleValueType(0).getScalarType();
13243   unsigned ElemBits = ElemTy.getSizeInBits();
13244 
13245   int64_t ShiftAmount;
13246   if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
13247     APInt SplatValue, SplatUndef;
13248     unsigned SplatBitSize;
13249     bool HasAnyUndefs;
13250     if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
13251                               HasAnyUndefs, ElemBits) ||
13252         SplatBitSize != ElemBits)
13253       return SDValue();
13254 
13255     ShiftAmount = SplatValue.getSExtValue();
13256   } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
13257     ShiftAmount = CVN->getSExtValue();
13258   } else
13259     return SDValue();
13260 
13261   unsigned Opcode;
13262   bool IsRightShift;
13263   switch (IID) {
13264   default:
13265     llvm_unreachable("Unknown shift intrinsic");
13266   case Intrinsic::aarch64_neon_sqshl:
13267     Opcode = AArch64ISD::SQSHL_I;
13268     IsRightShift = false;
13269     break;
13270   case Intrinsic::aarch64_neon_uqshl:
13271     Opcode = AArch64ISD::UQSHL_I;
13272     IsRightShift = false;
13273     break;
13274   case Intrinsic::aarch64_neon_srshl:
13275     Opcode = AArch64ISD::SRSHR_I;
13276     IsRightShift = true;
13277     break;
13278   case Intrinsic::aarch64_neon_urshl:
13279     Opcode = AArch64ISD::URSHR_I;
13280     IsRightShift = true;
13281     break;
13282   case Intrinsic::aarch64_neon_sqshlu:
13283     Opcode = AArch64ISD::SQSHLU_I;
13284     IsRightShift = false;
13285     break;
13286   case Intrinsic::aarch64_neon_sshl:
13287   case Intrinsic::aarch64_neon_ushl:
13288     // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
13289     // left shift for positive shift amounts. Below, we only replace the current
13290     // node with VSHL, if this condition is met.
13291     Opcode = AArch64ISD::VSHL;
13292     IsRightShift = false;
13293     break;
13294   }
13295 
13296   if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
13297     SDLoc dl(N);
13298     return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
13299                        DAG.getConstant(-ShiftAmount, dl, MVT::i32));
13300   } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
13301     SDLoc dl(N);
13302     return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
13303                        DAG.getConstant(ShiftAmount, dl, MVT::i32));
13304   }
13305 
13306   return SDValue();
13307 }
13308 
13309 // The CRC32[BH] instructions ignore the high bits of their data operand. Since
13310 // the intrinsics must be legal and take an i32, this means there's almost
13311 // certainly going to be a zext in the DAG which we can eliminate.
13312 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
13313   SDValue AndN = N->getOperand(2);
13314   if (AndN.getOpcode() != ISD::AND)
13315     return SDValue();
13316 
13317   ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
13318   if (!CMask || CMask->getZExtValue() != Mask)
13319     return SDValue();
13320 
13321   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
13322                      N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
13323 }
13324 
13325 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
13326                                            SelectionDAG &DAG) {
13327   SDLoc dl(N);
13328   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
13329                      DAG.getNode(Opc, dl,
13330                                  N->getOperand(1).getSimpleValueType(),
13331                                  N->getOperand(1)),
13332                      DAG.getConstant(0, dl, MVT::i64));
13333 }
13334 
13335 static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
13336   SDLoc DL(N);
13337   SDValue Op1 = N->getOperand(1);
13338   SDValue Op2 = N->getOperand(2);
13339   EVT ScalarTy = Op1.getValueType();
13340 
13341   if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) {
13342     Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
13343     Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
13344   }
13345 
13346   return DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, N->getValueType(0),
13347                      Op1, Op2);
13348 }
13349 
13350 static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
13351   SDLoc dl(N);
13352   SDValue Scalar = N->getOperand(3);
13353   EVT ScalarTy = Scalar.getValueType();
13354 
13355   if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
13356     Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
13357 
13358   SDValue Passthru = N->getOperand(1);
13359   SDValue Pred = N->getOperand(2);
13360   return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
13361                      Pred, Scalar, Passthru);
13362 }
13363 
13364 static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
13365   SDLoc dl(N);
13366   LLVMContext &Ctx = *DAG.getContext();
13367   EVT VT = N->getValueType(0);
13368 
13369   assert(VT.isScalableVector() && "Expected a scalable vector.");
13370 
13371   // Current lowering only supports the SVE-ACLE types.
13372   if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
13373     return SDValue();
13374 
13375   unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
13376   unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8;
13377   EVT ByteVT =
13378       EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
13379 
13380   // Convert everything to the domain of EXT (i.e bytes).
13381   SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
13382   SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
13383   SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
13384                             DAG.getConstant(ElemSize, dl, MVT::i32));
13385 
13386   SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
13387   return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
13388 }
13389 
13390 static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
13391                                         TargetLowering::DAGCombinerInfo &DCI,
13392                                         SelectionDAG &DAG) {
13393   if (DCI.isBeforeLegalize())
13394     return SDValue();
13395 
13396   SDValue Comparator = N->getOperand(3);
13397   if (Comparator.getOpcode() == AArch64ISD::DUP ||
13398       Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
13399     unsigned IID = getIntrinsicID(N);
13400     EVT VT = N->getValueType(0);
13401     EVT CmpVT = N->getOperand(2).getValueType();
13402     SDValue Pred = N->getOperand(1);
13403     SDValue Imm;
13404     SDLoc DL(N);
13405 
13406     switch (IID) {
13407     default:
13408       llvm_unreachable("Called with wrong intrinsic!");
13409       break;
13410 
13411     // Signed comparisons
13412     case Intrinsic::aarch64_sve_cmpeq_wide:
13413     case Intrinsic::aarch64_sve_cmpne_wide:
13414     case Intrinsic::aarch64_sve_cmpge_wide:
13415     case Intrinsic::aarch64_sve_cmpgt_wide:
13416     case Intrinsic::aarch64_sve_cmplt_wide:
13417     case Intrinsic::aarch64_sve_cmple_wide: {
13418       if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
13419         int64_t ImmVal = CN->getSExtValue();
13420         if (ImmVal >= -16 && ImmVal <= 15)
13421           Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
13422         else
13423           return SDValue();
13424       }
13425       break;
13426     }
13427     // Unsigned comparisons
13428     case Intrinsic::aarch64_sve_cmphs_wide:
13429     case Intrinsic::aarch64_sve_cmphi_wide:
13430     case Intrinsic::aarch64_sve_cmplo_wide:
13431     case Intrinsic::aarch64_sve_cmpls_wide:  {
13432       if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
13433         uint64_t ImmVal = CN->getZExtValue();
13434         if (ImmVal <= 127)
13435           Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
13436         else
13437           return SDValue();
13438       }
13439       break;
13440     }
13441     }
13442 
13443     if (!Imm)
13444       return SDValue();
13445 
13446     SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
13447     return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
13448                        N->getOperand(2), Splat, DAG.getCondCode(CC));
13449   }
13450 
13451   return SDValue();
13452 }
13453 
13454 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
13455                         AArch64CC::CondCode Cond) {
13456   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13457 
13458   SDLoc DL(Op);
13459   assert(Op.getValueType().isScalableVector() &&
13460          TLI.isTypeLegal(Op.getValueType()) &&
13461          "Expected legal scalable vector type!");
13462 
13463   // Ensure target specific opcodes are using legal type.
13464   EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
13465   SDValue TVal = DAG.getConstant(1, DL, OutVT);
13466   SDValue FVal = DAG.getConstant(0, DL, OutVT);
13467 
13468   // Set condition code (CC) flags.
13469   SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op);
13470 
13471   // Convert CC to integer based on requested condition.
13472   // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
13473   SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
13474   SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
13475   return DAG.getZExtOrTrunc(Res, DL, VT);
13476 }
13477 
13478 static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
13479                                       SelectionDAG &DAG) {
13480   SDLoc DL(N);
13481 
13482   SDValue Pred = N->getOperand(1);
13483   SDValue VecToReduce = N->getOperand(2);
13484 
13485   // NOTE: The integer reduction's result type is not always linked to the
13486   // operand's element type so we construct it from the intrinsic's result type.
13487   EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
13488   SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
13489 
13490   // SVE reductions set the whole vector register with the first element
13491   // containing the reduction result, which we'll now extract.
13492   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
13493   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
13494                      Zero);
13495 }
13496 
13497 static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
13498                                      SelectionDAG &DAG) {
13499   SDLoc DL(N);
13500 
13501   SDValue Pred = N->getOperand(1);
13502   SDValue VecToReduce = N->getOperand(2);
13503 
13504   EVT ReduceVT = VecToReduce.getValueType();
13505   SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
13506 
13507   // SVE reductions set the whole vector register with the first element
13508   // containing the reduction result, which we'll now extract.
13509   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
13510   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
13511                      Zero);
13512 }
13513 
13514 static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
13515                                             SelectionDAG &DAG) {
13516   SDLoc DL(N);
13517 
13518   SDValue Pred = N->getOperand(1);
13519   SDValue InitVal = N->getOperand(2);
13520   SDValue VecToReduce = N->getOperand(3);
13521   EVT ReduceVT = VecToReduce.getValueType();
13522 
13523   // Ordered reductions use the first lane of the result vector as the
13524   // reduction's initial value.
13525   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
13526   InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
13527                         DAG.getUNDEF(ReduceVT), InitVal, Zero);
13528 
13529   SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
13530 
13531   // SVE reductions set the whole vector register with the first element
13532   // containing the reduction result, which we'll now extract.
13533   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
13534                      Zero);
13535 }
13536 
13537 // If a merged operation has no inactive lanes we can relax it to a predicated
13538 // or unpredicated operation, which potentially allows better isel (perhaps
13539 // using immediate forms) or relaxing register reuse requirements.
13540 static SDValue convertMergedOpToPredOp(SDNode *N, unsigned PredOpc,
13541                                        SelectionDAG &DAG) {
13542   assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
13543   assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
13544   SDValue Pg = N->getOperand(1);
13545 
13546   // ISD way to specify an all active predicate.
13547   if ((Pg.getOpcode() == AArch64ISD::PTRUE) &&
13548       (Pg.getConstantOperandVal(0) == AArch64SVEPredPattern::all))
13549     return DAG.getNode(PredOpc, SDLoc(N), N->getValueType(0), Pg,
13550                        N->getOperand(2), N->getOperand(3));
13551 
13552   // FUTURE: SplatVector(true)
13553   return SDValue();
13554 }
13555 
13556 static SDValue performIntrinsicCombine(SDNode *N,
13557                                        TargetLowering::DAGCombinerInfo &DCI,
13558                                        const AArch64Subtarget *Subtarget) {
13559   SelectionDAG &DAG = DCI.DAG;
13560   unsigned IID = getIntrinsicID(N);
13561   switch (IID) {
13562   default:
13563     break;
13564   case Intrinsic::aarch64_neon_vcvtfxs2fp:
13565   case Intrinsic::aarch64_neon_vcvtfxu2fp:
13566     return tryCombineFixedPointConvert(N, DCI, DAG);
13567   case Intrinsic::aarch64_neon_saddv:
13568     return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
13569   case Intrinsic::aarch64_neon_uaddv:
13570     return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
13571   case Intrinsic::aarch64_neon_sminv:
13572     return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
13573   case Intrinsic::aarch64_neon_uminv:
13574     return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
13575   case Intrinsic::aarch64_neon_smaxv:
13576     return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
13577   case Intrinsic::aarch64_neon_umaxv:
13578     return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
13579   case Intrinsic::aarch64_neon_fmax:
13580     return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
13581                        N->getOperand(1), N->getOperand(2));
13582   case Intrinsic::aarch64_neon_fmin:
13583     return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
13584                        N->getOperand(1), N->getOperand(2));
13585   case Intrinsic::aarch64_neon_fmaxnm:
13586     return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
13587                        N->getOperand(1), N->getOperand(2));
13588   case Intrinsic::aarch64_neon_fminnm:
13589     return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
13590                        N->getOperand(1), N->getOperand(2));
13591   case Intrinsic::aarch64_neon_smull:
13592   case Intrinsic::aarch64_neon_umull:
13593   case Intrinsic::aarch64_neon_pmull:
13594   case Intrinsic::aarch64_neon_sqdmull:
13595     return tryCombineLongOpWithDup(IID, N, DCI, DAG);
13596   case Intrinsic::aarch64_neon_sqshl:
13597   case Intrinsic::aarch64_neon_uqshl:
13598   case Intrinsic::aarch64_neon_sqshlu:
13599   case Intrinsic::aarch64_neon_srshl:
13600   case Intrinsic::aarch64_neon_urshl:
13601   case Intrinsic::aarch64_neon_sshl:
13602   case Intrinsic::aarch64_neon_ushl:
13603     return tryCombineShiftImm(IID, N, DAG);
13604   case Intrinsic::aarch64_crc32b:
13605   case Intrinsic::aarch64_crc32cb:
13606     return tryCombineCRC32(0xff, N, DAG);
13607   case Intrinsic::aarch64_crc32h:
13608   case Intrinsic::aarch64_crc32ch:
13609     return tryCombineCRC32(0xffff, N, DAG);
13610   case Intrinsic::aarch64_sve_saddv:
13611     // There is no i64 version of SADDV because the sign is irrelevant.
13612     if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
13613       return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
13614     else
13615       return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
13616   case Intrinsic::aarch64_sve_uaddv:
13617     return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
13618   case Intrinsic::aarch64_sve_smaxv:
13619     return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
13620   case Intrinsic::aarch64_sve_umaxv:
13621     return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
13622   case Intrinsic::aarch64_sve_sminv:
13623     return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
13624   case Intrinsic::aarch64_sve_uminv:
13625     return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
13626   case Intrinsic::aarch64_sve_orv:
13627     return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
13628   case Intrinsic::aarch64_sve_eorv:
13629     return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
13630   case Intrinsic::aarch64_sve_andv:
13631     return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
13632   case Intrinsic::aarch64_sve_index:
13633     return LowerSVEIntrinsicIndex(N, DAG);
13634   case Intrinsic::aarch64_sve_dup:
13635     return LowerSVEIntrinsicDUP(N, DAG);
13636   case Intrinsic::aarch64_sve_dup_x:
13637     return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
13638                        N->getOperand(1));
13639   case Intrinsic::aarch64_sve_ext:
13640     return LowerSVEIntrinsicEXT(N, DAG);
13641   case Intrinsic::aarch64_sve_smin:
13642     return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG);
13643   case Intrinsic::aarch64_sve_umin:
13644     return convertMergedOpToPredOp(N, AArch64ISD::UMIN_PRED, DAG);
13645   case Intrinsic::aarch64_sve_smax:
13646     return convertMergedOpToPredOp(N, AArch64ISD::SMAX_PRED, DAG);
13647   case Intrinsic::aarch64_sve_umax:
13648     return convertMergedOpToPredOp(N, AArch64ISD::UMAX_PRED, DAG);
13649   case Intrinsic::aarch64_sve_lsl:
13650     return convertMergedOpToPredOp(N, AArch64ISD::SHL_PRED, DAG);
13651   case Intrinsic::aarch64_sve_lsr:
13652     return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG);
13653   case Intrinsic::aarch64_sve_asr:
13654     return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG);
13655   case Intrinsic::aarch64_sve_cmphs:
13656     if (!N->getOperand(2).getValueType().isFloatingPoint())
13657       return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
13658                          N->getValueType(0), N->getOperand(1), N->getOperand(2),
13659                          N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
13660     break;
13661   case Intrinsic::aarch64_sve_cmphi:
13662     if (!N->getOperand(2).getValueType().isFloatingPoint())
13663       return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
13664                          N->getValueType(0), N->getOperand(1), N->getOperand(2),
13665                          N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
13666     break;
13667   case Intrinsic::aarch64_sve_cmpge:
13668     if (!N->getOperand(2).getValueType().isFloatingPoint())
13669       return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
13670                          N->getValueType(0), N->getOperand(1), N->getOperand(2),
13671                          N->getOperand(3), DAG.getCondCode(ISD::SETGE));
13672     break;
13673   case Intrinsic::aarch64_sve_cmpgt:
13674     if (!N->getOperand(2).getValueType().isFloatingPoint())
13675       return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
13676                          N->getValueType(0), N->getOperand(1), N->getOperand(2),
13677                          N->getOperand(3), DAG.getCondCode(ISD::SETGT));
13678     break;
13679   case Intrinsic::aarch64_sve_cmpeq:
13680     if (!N->getOperand(2).getValueType().isFloatingPoint())
13681       return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
13682                          N->getValueType(0), N->getOperand(1), N->getOperand(2),
13683                          N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
13684     break;
13685   case Intrinsic::aarch64_sve_cmpne:
13686     if (!N->getOperand(2).getValueType().isFloatingPoint())
13687       return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
13688                          N->getValueType(0), N->getOperand(1), N->getOperand(2),
13689                          N->getOperand(3), DAG.getCondCode(ISD::SETNE));
13690     break;
13691   case Intrinsic::aarch64_sve_fadda:
13692     return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
13693   case Intrinsic::aarch64_sve_faddv:
13694     return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
13695   case Intrinsic::aarch64_sve_fmaxnmv:
13696     return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
13697   case Intrinsic::aarch64_sve_fmaxv:
13698     return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
13699   case Intrinsic::aarch64_sve_fminnmv:
13700     return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
13701   case Intrinsic::aarch64_sve_fminv:
13702     return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
13703   case Intrinsic::aarch64_sve_sel:
13704     return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
13705                        N->getOperand(1), N->getOperand(2), N->getOperand(3));
13706   case Intrinsic::aarch64_sve_cmpeq_wide:
13707     return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
13708   case Intrinsic::aarch64_sve_cmpne_wide:
13709     return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
13710   case Intrinsic::aarch64_sve_cmpge_wide:
13711     return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
13712   case Intrinsic::aarch64_sve_cmpgt_wide:
13713     return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
13714   case Intrinsic::aarch64_sve_cmplt_wide:
13715     return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
13716   case Intrinsic::aarch64_sve_cmple_wide:
13717     return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
13718   case Intrinsic::aarch64_sve_cmphs_wide:
13719     return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
13720   case Intrinsic::aarch64_sve_cmphi_wide:
13721     return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
13722   case Intrinsic::aarch64_sve_cmplo_wide:
13723     return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
13724   case Intrinsic::aarch64_sve_cmpls_wide:
13725     return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
13726   case Intrinsic::aarch64_sve_ptest_any:
13727     return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
13728                     AArch64CC::ANY_ACTIVE);
13729   case Intrinsic::aarch64_sve_ptest_first:
13730     return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
13731                     AArch64CC::FIRST_ACTIVE);
13732   case Intrinsic::aarch64_sve_ptest_last:
13733     return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
13734                     AArch64CC::LAST_ACTIVE);
13735   }
13736   return SDValue();
13737 }
13738 
13739 static SDValue performExtendCombine(SDNode *N,
13740                                     TargetLowering::DAGCombinerInfo &DCI,
13741                                     SelectionDAG &DAG) {
13742   // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
13743   // we can convert that DUP into another extract_high (of a bigger DUP), which
13744   // helps the backend to decide that an sabdl2 would be useful, saving a real
13745   // extract_high operation.
13746   if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
13747       (N->getOperand(0).getOpcode() == AArch64ISD::UABD ||
13748        N->getOperand(0).getOpcode() == AArch64ISD::SABD)) {
13749     SDNode *ABDNode = N->getOperand(0).getNode();
13750     SDValue NewABD =
13751         tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
13752     if (!NewABD.getNode())
13753       return SDValue();
13754 
13755     return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
13756   }
13757 
13758   // This is effectively a custom type legalization for AArch64.
13759   //
13760   // Type legalization will split an extend of a small, legal, type to a larger
13761   // illegal type by first splitting the destination type, often creating
13762   // illegal source types, which then get legalized in isel-confusing ways,
13763   // leading to really terrible codegen. E.g.,
13764   //   %result = v8i32 sext v8i8 %value
13765   // becomes
13766   //   %losrc = extract_subreg %value, ...
13767   //   %hisrc = extract_subreg %value, ...
13768   //   %lo = v4i32 sext v4i8 %losrc
13769   //   %hi = v4i32 sext v4i8 %hisrc
13770   // Things go rapidly downhill from there.
13771   //
13772   // For AArch64, the [sz]ext vector instructions can only go up one element
13773   // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
13774   // take two instructions.
13775   //
13776   // This implies that the most efficient way to do the extend from v8i8
13777   // to two v4i32 values is to first extend the v8i8 to v8i16, then do
13778   // the normal splitting to happen for the v8i16->v8i32.
13779 
13780   // This is pre-legalization to catch some cases where the default
13781   // type legalization will create ill-tempered code.
13782   if (!DCI.isBeforeLegalizeOps())
13783     return SDValue();
13784 
13785   // We're only interested in cleaning things up for non-legal vector types
13786   // here. If both the source and destination are legal, things will just
13787   // work naturally without any fiddling.
13788   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13789   EVT ResVT = N->getValueType(0);
13790   if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
13791     return SDValue();
13792   // If the vector type isn't a simple VT, it's beyond the scope of what
13793   // we're  worried about here. Let legalization do its thing and hope for
13794   // the best.
13795   SDValue Src = N->getOperand(0);
13796   EVT SrcVT = Src->getValueType(0);
13797   if (!ResVT.isSimple() || !SrcVT.isSimple())
13798     return SDValue();
13799 
13800   // If the source VT is a 64-bit fixed or scalable vector, we can play games
13801   // and get the better results we want.
13802   if (SrcVT.getSizeInBits().getKnownMinSize() != 64)
13803     return SDValue();
13804 
13805   unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
13806   ElementCount SrcEC = SrcVT.getVectorElementCount();
13807   SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), SrcEC);
13808   SDLoc DL(N);
13809   Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
13810 
13811   // Now split the rest of the operation into two halves, each with a 64
13812   // bit source.
13813   EVT LoVT, HiVT;
13814   SDValue Lo, Hi;
13815   LoVT = HiVT = ResVT.getHalfNumVectorElementsVT(*DAG.getContext());
13816 
13817   EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
13818                                LoVT.getVectorElementCount());
13819   Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
13820                    DAG.getConstant(0, DL, MVT::i64));
13821   Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
13822                    DAG.getConstant(InNVT.getVectorMinNumElements(), DL, MVT::i64));
13823   Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
13824   Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
13825 
13826   // Now combine the parts back together so we still have a single result
13827   // like the combiner expects.
13828   return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
13829 }
13830 
13831 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
13832                                SDValue SplatVal, unsigned NumVecElts) {
13833   assert(!St.isTruncatingStore() && "cannot split truncating vector store");
13834   unsigned OrigAlignment = St.getAlignment();
13835   unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
13836 
13837   // Create scalar stores. This is at least as good as the code sequence for a
13838   // split unaligned store which is a dup.s, ext.b, and two stores.
13839   // Most of the time the three stores should be replaced by store pair
13840   // instructions (stp).
13841   SDLoc DL(&St);
13842   SDValue BasePtr = St.getBasePtr();
13843   uint64_t BaseOffset = 0;
13844 
13845   const MachinePointerInfo &PtrInfo = St.getPointerInfo();
13846   SDValue NewST1 =
13847       DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
13848                    OrigAlignment, St.getMemOperand()->getFlags());
13849 
13850   // As this in ISel, we will not merge this add which may degrade results.
13851   if (BasePtr->getOpcode() == ISD::ADD &&
13852       isa<ConstantSDNode>(BasePtr->getOperand(1))) {
13853     BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
13854     BasePtr = BasePtr->getOperand(0);
13855   }
13856 
13857   unsigned Offset = EltOffset;
13858   while (--NumVecElts) {
13859     unsigned Alignment = MinAlign(OrigAlignment, Offset);
13860     SDValue OffsetPtr =
13861         DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
13862                     DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
13863     NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
13864                           PtrInfo.getWithOffset(Offset), Alignment,
13865                           St.getMemOperand()->getFlags());
13866     Offset += EltOffset;
13867   }
13868   return NewST1;
13869 }
13870 
13871 // Returns an SVE type that ContentTy can be trivially sign or zero extended
13872 // into.
13873 static MVT getSVEContainerType(EVT ContentTy) {
13874   assert(ContentTy.isSimple() && "No SVE containers for extended types");
13875 
13876   switch (ContentTy.getSimpleVT().SimpleTy) {
13877   default:
13878     llvm_unreachable("No known SVE container for this MVT type");
13879   case MVT::nxv2i8:
13880   case MVT::nxv2i16:
13881   case MVT::nxv2i32:
13882   case MVT::nxv2i64:
13883   case MVT::nxv2f32:
13884   case MVT::nxv2f64:
13885     return MVT::nxv2i64;
13886   case MVT::nxv4i8:
13887   case MVT::nxv4i16:
13888   case MVT::nxv4i32:
13889   case MVT::nxv4f32:
13890     return MVT::nxv4i32;
13891   case MVT::nxv8i8:
13892   case MVT::nxv8i16:
13893   case MVT::nxv8f16:
13894   case MVT::nxv8bf16:
13895     return MVT::nxv8i16;
13896   case MVT::nxv16i8:
13897     return MVT::nxv16i8;
13898   }
13899 }
13900 
13901 static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
13902   SDLoc DL(N);
13903   EVT VT = N->getValueType(0);
13904 
13905   if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
13906     return SDValue();
13907 
13908   EVT ContainerVT = VT;
13909   if (ContainerVT.isInteger())
13910     ContainerVT = getSVEContainerType(ContainerVT);
13911 
13912   SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
13913   SDValue Ops[] = { N->getOperand(0), // Chain
13914                     N->getOperand(2), // Pg
13915                     N->getOperand(3), // Base
13916                     DAG.getValueType(VT) };
13917 
13918   SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
13919   SDValue LoadChain = SDValue(Load.getNode(), 1);
13920 
13921   if (ContainerVT.isInteger() && (VT != ContainerVT))
13922     Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
13923 
13924   return DAG.getMergeValues({ Load, LoadChain }, DL);
13925 }
13926 
13927 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
13928   SDLoc DL(N);
13929   EVT VT = N->getValueType(0);
13930   EVT PtrTy = N->getOperand(3).getValueType();
13931 
13932   if (VT == MVT::nxv8bf16 &&
13933       !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
13934     return SDValue();
13935 
13936   EVT LoadVT = VT;
13937   if (VT.isFloatingPoint())
13938     LoadVT = VT.changeTypeToInteger();
13939 
13940   auto *MINode = cast<MemIntrinsicSDNode>(N);
13941   SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
13942   SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
13943                                 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
13944                                 MINode->getOperand(2), PassThru,
13945                                 MINode->getMemoryVT(), MINode->getMemOperand(),
13946                                 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
13947 
13948    if (VT.isFloatingPoint()) {
13949      SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
13950      return DAG.getMergeValues(Ops, DL);
13951    }
13952 
13953   return L;
13954 }
13955 
13956 template <unsigned Opcode>
13957 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
13958   static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
13959                     Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
13960                 "Unsupported opcode.");
13961   SDLoc DL(N);
13962   EVT VT = N->getValueType(0);
13963   if (VT == MVT::nxv8bf16 &&
13964       !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
13965     return SDValue();
13966 
13967   EVT LoadVT = VT;
13968   if (VT.isFloatingPoint())
13969     LoadVT = VT.changeTypeToInteger();
13970 
13971   SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
13972   SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
13973   SDValue LoadChain = SDValue(Load.getNode(), 1);
13974 
13975   if (VT.isFloatingPoint())
13976     Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
13977 
13978   return DAG.getMergeValues({Load, LoadChain}, DL);
13979 }
13980 
13981 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
13982   SDLoc DL(N);
13983   SDValue Data = N->getOperand(2);
13984   EVT DataVT = Data.getValueType();
13985   EVT HwSrcVt = getSVEContainerType(DataVT);
13986   SDValue InputVT = DAG.getValueType(DataVT);
13987 
13988   if (DataVT == MVT::nxv8bf16 &&
13989       !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
13990     return SDValue();
13991 
13992   if (DataVT.isFloatingPoint())
13993     InputVT = DAG.getValueType(HwSrcVt);
13994 
13995   SDValue SrcNew;
13996   if (Data.getValueType().isFloatingPoint())
13997     SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
13998   else
13999     SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
14000 
14001   SDValue Ops[] = { N->getOperand(0), // Chain
14002                     SrcNew,
14003                     N->getOperand(4), // Base
14004                     N->getOperand(3), // Pg
14005                     InputVT
14006                   };
14007 
14008   return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
14009 }
14010 
14011 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
14012   SDLoc DL(N);
14013 
14014   SDValue Data = N->getOperand(2);
14015   EVT DataVT = Data.getValueType();
14016   EVT PtrTy = N->getOperand(4).getValueType();
14017 
14018   if (DataVT == MVT::nxv8bf16 &&
14019       !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
14020     return SDValue();
14021 
14022   if (DataVT.isFloatingPoint())
14023     Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
14024 
14025   auto *MINode = cast<MemIntrinsicSDNode>(N);
14026   return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
14027                             DAG.getUNDEF(PtrTy), MINode->getOperand(3),
14028                             MINode->getMemoryVT(), MINode->getMemOperand(),
14029                             ISD::UNINDEXED, false, false);
14030 }
14031 
14032 /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.  The
14033 /// load store optimizer pass will merge them to store pair stores.  This should
14034 /// be better than a movi to create the vector zero followed by a vector store
14035 /// if the zero constant is not re-used, since one instructions and one register
14036 /// live range will be removed.
14037 ///
14038 /// For example, the final generated code should be:
14039 ///
14040 ///   stp xzr, xzr, [x0]
14041 ///
14042 /// instead of:
14043 ///
14044 ///   movi v0.2d, #0
14045 ///   str q0, [x0]
14046 ///
14047 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
14048   SDValue StVal = St.getValue();
14049   EVT VT = StVal.getValueType();
14050 
14051   // Avoid scalarizing zero splat stores for scalable vectors.
14052   if (VT.isScalableVector())
14053     return SDValue();
14054 
14055   // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
14056   // 2, 3 or 4 i32 elements.
14057   int NumVecElts = VT.getVectorNumElements();
14058   if (!(((NumVecElts == 2 || NumVecElts == 3) &&
14059          VT.getVectorElementType().getSizeInBits() == 64) ||
14060         ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
14061          VT.getVectorElementType().getSizeInBits() == 32)))
14062     return SDValue();
14063 
14064   if (StVal.getOpcode() != ISD::BUILD_VECTOR)
14065     return SDValue();
14066 
14067   // If the zero constant has more than one use then the vector store could be
14068   // better since the constant mov will be amortized and stp q instructions
14069   // should be able to be formed.
14070   if (!StVal.hasOneUse())
14071     return SDValue();
14072 
14073   // If the store is truncating then it's going down to i16 or smaller, which
14074   // means it can be implemented in a single store anyway.
14075   if (St.isTruncatingStore())
14076     return SDValue();
14077 
14078   // If the immediate offset of the address operand is too large for the stp
14079   // instruction, then bail out.
14080   if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
14081     int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
14082     if (Offset < -512 || Offset > 504)
14083       return SDValue();
14084   }
14085 
14086   for (int I = 0; I < NumVecElts; ++I) {
14087     SDValue EltVal = StVal.getOperand(I);
14088     if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
14089       return SDValue();
14090   }
14091 
14092   // Use a CopyFromReg WZR/XZR here to prevent
14093   // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
14094   SDLoc DL(&St);
14095   unsigned ZeroReg;
14096   EVT ZeroVT;
14097   if (VT.getVectorElementType().getSizeInBits() == 32) {
14098     ZeroReg = AArch64::WZR;
14099     ZeroVT = MVT::i32;
14100   } else {
14101     ZeroReg = AArch64::XZR;
14102     ZeroVT = MVT::i64;
14103   }
14104   SDValue SplatVal =
14105       DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
14106   return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
14107 }
14108 
14109 /// Replace a splat of a scalar to a vector store by scalar stores of the scalar
14110 /// value. The load store optimizer pass will merge them to store pair stores.
14111 /// This has better performance than a splat of the scalar followed by a split
14112 /// vector store. Even if the stores are not merged it is four stores vs a dup,
14113 /// followed by an ext.b and two stores.
14114 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
14115   SDValue StVal = St.getValue();
14116   EVT VT = StVal.getValueType();
14117 
14118   // Don't replace floating point stores, they possibly won't be transformed to
14119   // stp because of the store pair suppress pass.
14120   if (VT.isFloatingPoint())
14121     return SDValue();
14122 
14123   // We can express a splat as store pair(s) for 2 or 4 elements.
14124   unsigned NumVecElts = VT.getVectorNumElements();
14125   if (NumVecElts != 4 && NumVecElts != 2)
14126     return SDValue();
14127 
14128   // If the store is truncating then it's going down to i16 or smaller, which
14129   // means it can be implemented in a single store anyway.
14130   if (St.isTruncatingStore())
14131     return SDValue();
14132 
14133   // Check that this is a splat.
14134   // Make sure that each of the relevant vector element locations are inserted
14135   // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
14136   std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
14137   SDValue SplatVal;
14138   for (unsigned I = 0; I < NumVecElts; ++I) {
14139     // Check for insert vector elements.
14140     if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
14141       return SDValue();
14142 
14143     // Check that same value is inserted at each vector element.
14144     if (I == 0)
14145       SplatVal = StVal.getOperand(1);
14146     else if (StVal.getOperand(1) != SplatVal)
14147       return SDValue();
14148 
14149     // Check insert element index.
14150     ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
14151     if (!CIndex)
14152       return SDValue();
14153     uint64_t IndexVal = CIndex->getZExtValue();
14154     if (IndexVal >= NumVecElts)
14155       return SDValue();
14156     IndexNotInserted.reset(IndexVal);
14157 
14158     StVal = StVal.getOperand(0);
14159   }
14160   // Check that all vector element locations were inserted to.
14161   if (IndexNotInserted.any())
14162       return SDValue();
14163 
14164   return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
14165 }
14166 
14167 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
14168                            SelectionDAG &DAG,
14169                            const AArch64Subtarget *Subtarget) {
14170 
14171   StoreSDNode *S = cast<StoreSDNode>(N);
14172   if (S->isVolatile() || S->isIndexed())
14173     return SDValue();
14174 
14175   SDValue StVal = S->getValue();
14176   EVT VT = StVal.getValueType();
14177 
14178   if (!VT.isFixedLengthVector())
14179     return SDValue();
14180 
14181   // If we get a splat of zeros, convert this vector store to a store of
14182   // scalars. They will be merged into store pairs of xzr thereby removing one
14183   // instruction and one register.
14184   if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
14185     return ReplacedZeroSplat;
14186 
14187   // FIXME: The logic for deciding if an unaligned store should be split should
14188   // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
14189   // a call to that function here.
14190 
14191   if (!Subtarget->isMisaligned128StoreSlow())
14192     return SDValue();
14193 
14194   // Don't split at -Oz.
14195   if (DAG.getMachineFunction().getFunction().hasMinSize())
14196     return SDValue();
14197 
14198   // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
14199   // those up regresses performance on micro-benchmarks and olden/bh.
14200   if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
14201     return SDValue();
14202 
14203   // Split unaligned 16B stores. They are terrible for performance.
14204   // Don't split stores with alignment of 1 or 2. Code that uses clang vector
14205   // extensions can use this to mark that it does not want splitting to happen
14206   // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
14207   // eliminating alignment hazards is only 1 in 8 for alignment of 2.
14208   if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
14209       S->getAlignment() <= 2)
14210     return SDValue();
14211 
14212   // If we get a splat of a scalar convert this vector store to a store of
14213   // scalars. They will be merged into store pairs thereby removing two
14214   // instructions.
14215   if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
14216     return ReplacedSplat;
14217 
14218   SDLoc DL(S);
14219 
14220   // Split VT into two.
14221   EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14222   unsigned NumElts = HalfVT.getVectorNumElements();
14223   SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
14224                                    DAG.getConstant(0, DL, MVT::i64));
14225   SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
14226                                    DAG.getConstant(NumElts, DL, MVT::i64));
14227   SDValue BasePtr = S->getBasePtr();
14228   SDValue NewST1 =
14229       DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
14230                    S->getAlignment(), S->getMemOperand()->getFlags());
14231   SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
14232                                   DAG.getConstant(8, DL, MVT::i64));
14233   return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
14234                       S->getPointerInfo(), S->getAlignment(),
14235                       S->getMemOperand()->getFlags());
14236 }
14237 
14238 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
14239   SDLoc DL(N);
14240   SDValue Op0 = N->getOperand(0);
14241   SDValue Op1 = N->getOperand(1);
14242   EVT ResVT = N->getValueType(0);
14243 
14244   // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
14245   if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
14246     if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
14247       SDValue X = Op0.getOperand(0).getOperand(0);
14248       return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
14249     }
14250   }
14251 
14252   // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
14253   if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
14254     if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
14255       SDValue Z = Op1.getOperand(0).getOperand(1);
14256       return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
14257     }
14258   }
14259 
14260   return SDValue();
14261 }
14262 
14263 /// Target-specific DAG combine function for post-increment LD1 (lane) and
14264 /// post-increment LD1R.
14265 static SDValue performPostLD1Combine(SDNode *N,
14266                                      TargetLowering::DAGCombinerInfo &DCI,
14267                                      bool IsLaneOp) {
14268   if (DCI.isBeforeLegalizeOps())
14269     return SDValue();
14270 
14271   SelectionDAG &DAG = DCI.DAG;
14272   EVT VT = N->getValueType(0);
14273 
14274   if (VT.isScalableVector())
14275     return SDValue();
14276 
14277   unsigned LoadIdx = IsLaneOp ? 1 : 0;
14278   SDNode *LD = N->getOperand(LoadIdx).getNode();
14279   // If it is not LOAD, can not do such combine.
14280   if (LD->getOpcode() != ISD::LOAD)
14281     return SDValue();
14282 
14283   // The vector lane must be a constant in the LD1LANE opcode.
14284   SDValue Lane;
14285   if (IsLaneOp) {
14286     Lane = N->getOperand(2);
14287     auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
14288     if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
14289       return SDValue();
14290   }
14291 
14292   LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
14293   EVT MemVT = LoadSDN->getMemoryVT();
14294   // Check if memory operand is the same type as the vector element.
14295   if (MemVT != VT.getVectorElementType())
14296     return SDValue();
14297 
14298   // Check if there are other uses. If so, do not combine as it will introduce
14299   // an extra load.
14300   for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
14301        ++UI) {
14302     if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
14303       continue;
14304     if (*UI != N)
14305       return SDValue();
14306   }
14307 
14308   SDValue Addr = LD->getOperand(1);
14309   SDValue Vector = N->getOperand(0);
14310   // Search for a use of the address operand that is an increment.
14311   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
14312        Addr.getNode()->use_end(); UI != UE; ++UI) {
14313     SDNode *User = *UI;
14314     if (User->getOpcode() != ISD::ADD
14315         || UI.getUse().getResNo() != Addr.getResNo())
14316       continue;
14317 
14318     // If the increment is a constant, it must match the memory ref size.
14319     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
14320     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
14321       uint32_t IncVal = CInc->getZExtValue();
14322       unsigned NumBytes = VT.getScalarSizeInBits() / 8;
14323       if (IncVal != NumBytes)
14324         continue;
14325       Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
14326     }
14327 
14328     // To avoid cycle construction make sure that neither the load nor the add
14329     // are predecessors to each other or the Vector.
14330     SmallPtrSet<const SDNode *, 32> Visited;
14331     SmallVector<const SDNode *, 16> Worklist;
14332     Visited.insert(Addr.getNode());
14333     Worklist.push_back(User);
14334     Worklist.push_back(LD);
14335     Worklist.push_back(Vector.getNode());
14336     if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
14337         SDNode::hasPredecessorHelper(User, Visited, Worklist))
14338       continue;
14339 
14340     SmallVector<SDValue, 8> Ops;
14341     Ops.push_back(LD->getOperand(0));  // Chain
14342     if (IsLaneOp) {
14343       Ops.push_back(Vector);           // The vector to be inserted
14344       Ops.push_back(Lane);             // The lane to be inserted in the vector
14345     }
14346     Ops.push_back(Addr);
14347     Ops.push_back(Inc);
14348 
14349     EVT Tys[3] = { VT, MVT::i64, MVT::Other };
14350     SDVTList SDTys = DAG.getVTList(Tys);
14351     unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
14352     SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
14353                                            MemVT,
14354                                            LoadSDN->getMemOperand());
14355 
14356     // Update the uses.
14357     SDValue NewResults[] = {
14358         SDValue(LD, 0),            // The result of load
14359         SDValue(UpdN.getNode(), 2) // Chain
14360     };
14361     DCI.CombineTo(LD, NewResults);
14362     DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
14363     DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register
14364 
14365     break;
14366   }
14367   return SDValue();
14368 }
14369 
14370 /// Simplify ``Addr`` given that the top byte of it is ignored by HW during
14371 /// address translation.
14372 static bool performTBISimplification(SDValue Addr,
14373                                      TargetLowering::DAGCombinerInfo &DCI,
14374                                      SelectionDAG &DAG) {
14375   APInt DemandedMask = APInt::getLowBitsSet(64, 56);
14376   KnownBits Known;
14377   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
14378                                         !DCI.isBeforeLegalizeOps());
14379   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14380   if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
14381     DCI.CommitTargetLoweringOpt(TLO);
14382     return true;
14383   }
14384   return false;
14385 }
14386 
14387 static SDValue performSTORECombine(SDNode *N,
14388                                    TargetLowering::DAGCombinerInfo &DCI,
14389                                    SelectionDAG &DAG,
14390                                    const AArch64Subtarget *Subtarget) {
14391   if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
14392     return Split;
14393 
14394   if (Subtarget->supportsAddressTopByteIgnored() &&
14395       performTBISimplification(N->getOperand(2), DCI, DAG))
14396     return SDValue(N, 0);
14397 
14398   return SDValue();
14399 }
14400 
14401 static SDValue performMaskedGatherScatterCombine(SDNode *N,
14402                                       TargetLowering::DAGCombinerInfo &DCI,
14403                                       SelectionDAG &DAG) {
14404   MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
14405   assert(MGS && "Can only combine gather load or scatter store nodes");
14406 
14407   SDLoc DL(MGS);
14408   SDValue Chain = MGS->getChain();
14409   SDValue Scale = MGS->getScale();
14410   SDValue Index = MGS->getIndex();
14411   SDValue Mask = MGS->getMask();
14412   SDValue BasePtr = MGS->getBasePtr();
14413   ISD::MemIndexType IndexType = MGS->getIndexType();
14414 
14415   EVT IdxVT = Index.getValueType();
14416 
14417   if (DCI.isBeforeLegalize()) {
14418     // SVE gather/scatter requires indices of i32/i64. Promote anything smaller
14419     // prior to legalisation so the result can be split if required.
14420     if ((IdxVT.getVectorElementType() == MVT::i8) ||
14421         (IdxVT.getVectorElementType() == MVT::i16)) {
14422       EVT NewIdxVT = IdxVT.changeVectorElementType(MVT::i32);
14423       if (MGS->isIndexSigned())
14424         Index = DAG.getNode(ISD::SIGN_EXTEND, DL, NewIdxVT, Index);
14425       else
14426         Index = DAG.getNode(ISD::ZERO_EXTEND, DL, NewIdxVT, Index);
14427 
14428       if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
14429         SDValue PassThru = MGT->getPassThru();
14430         SDValue Ops[] = { Chain, PassThru, Mask, BasePtr, Index, Scale };
14431         return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
14432                                    PassThru.getValueType(), DL, Ops,
14433                                    MGT->getMemOperand(),
14434                                    MGT->getIndexType(), MGT->getExtensionType());
14435       } else {
14436         auto *MSC = cast<MaskedScatterSDNode>(MGS);
14437         SDValue Data = MSC->getValue();
14438         SDValue Ops[] = { Chain, Data, Mask, BasePtr, Index, Scale };
14439         return DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
14440                                     MSC->getMemoryVT(), DL, Ops,
14441                                     MSC->getMemOperand(), IndexType,
14442                                     MSC->isTruncatingStore());
14443       }
14444     }
14445   }
14446 
14447   return SDValue();
14448 }
14449 
14450 /// Target-specific DAG combine function for NEON load/store intrinsics
14451 /// to merge base address updates.
14452 static SDValue performNEONPostLDSTCombine(SDNode *N,
14453                                           TargetLowering::DAGCombinerInfo &DCI,
14454                                           SelectionDAG &DAG) {
14455   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14456     return SDValue();
14457 
14458   unsigned AddrOpIdx = N->getNumOperands() - 1;
14459   SDValue Addr = N->getOperand(AddrOpIdx);
14460 
14461   // Search for a use of the address operand that is an increment.
14462   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
14463        UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
14464     SDNode *User = *UI;
14465     if (User->getOpcode() != ISD::ADD ||
14466         UI.getUse().getResNo() != Addr.getResNo())
14467       continue;
14468 
14469     // Check that the add is independent of the load/store.  Otherwise, folding
14470     // it would create a cycle.
14471     SmallPtrSet<const SDNode *, 32> Visited;
14472     SmallVector<const SDNode *, 16> Worklist;
14473     Visited.insert(Addr.getNode());
14474     Worklist.push_back(N);
14475     Worklist.push_back(User);
14476     if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
14477         SDNode::hasPredecessorHelper(User, Visited, Worklist))
14478       continue;
14479 
14480     // Find the new opcode for the updating load/store.
14481     bool IsStore = false;
14482     bool IsLaneOp = false;
14483     bool IsDupOp = false;
14484     unsigned NewOpc = 0;
14485     unsigned NumVecs = 0;
14486     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
14487     switch (IntNo) {
14488     default: llvm_unreachable("unexpected intrinsic for Neon base update");
14489     case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
14490       NumVecs = 2; break;
14491     case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
14492       NumVecs = 3; break;
14493     case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
14494       NumVecs = 4; break;
14495     case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
14496       NumVecs = 2; IsStore = true; break;
14497     case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
14498       NumVecs = 3; IsStore = true; break;
14499     case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
14500       NumVecs = 4; IsStore = true; break;
14501     case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
14502       NumVecs = 2; break;
14503     case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
14504       NumVecs = 3; break;
14505     case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
14506       NumVecs = 4; break;
14507     case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
14508       NumVecs = 2; IsStore = true; break;
14509     case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
14510       NumVecs = 3; IsStore = true; break;
14511     case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
14512       NumVecs = 4; IsStore = true; break;
14513     case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
14514       NumVecs = 2; IsDupOp = true; break;
14515     case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
14516       NumVecs = 3; IsDupOp = true; break;
14517     case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
14518       NumVecs = 4; IsDupOp = true; break;
14519     case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
14520       NumVecs = 2; IsLaneOp = true; break;
14521     case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
14522       NumVecs = 3; IsLaneOp = true; break;
14523     case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
14524       NumVecs = 4; IsLaneOp = true; break;
14525     case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
14526       NumVecs = 2; IsStore = true; IsLaneOp = true; break;
14527     case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
14528       NumVecs = 3; IsStore = true; IsLaneOp = true; break;
14529     case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
14530       NumVecs = 4; IsStore = true; IsLaneOp = true; break;
14531     }
14532 
14533     EVT VecTy;
14534     if (IsStore)
14535       VecTy = N->getOperand(2).getValueType();
14536     else
14537       VecTy = N->getValueType(0);
14538 
14539     // If the increment is a constant, it must match the memory ref size.
14540     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
14541     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
14542       uint32_t IncVal = CInc->getZExtValue();
14543       unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
14544       if (IsLaneOp || IsDupOp)
14545         NumBytes /= VecTy.getVectorNumElements();
14546       if (IncVal != NumBytes)
14547         continue;
14548       Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
14549     }
14550     SmallVector<SDValue, 8> Ops;
14551     Ops.push_back(N->getOperand(0)); // Incoming chain
14552     // Load lane and store have vector list as input.
14553     if (IsLaneOp || IsStore)
14554       for (unsigned i = 2; i < AddrOpIdx; ++i)
14555         Ops.push_back(N->getOperand(i));
14556     Ops.push_back(Addr); // Base register
14557     Ops.push_back(Inc);
14558 
14559     // Return Types.
14560     EVT Tys[6];
14561     unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
14562     unsigned n;
14563     for (n = 0; n < NumResultVecs; ++n)
14564       Tys[n] = VecTy;
14565     Tys[n++] = MVT::i64;  // Type of write back register
14566     Tys[n] = MVT::Other;  // Type of the chain
14567     SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
14568 
14569     MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
14570     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
14571                                            MemInt->getMemoryVT(),
14572                                            MemInt->getMemOperand());
14573 
14574     // Update the uses.
14575     std::vector<SDValue> NewResults;
14576     for (unsigned i = 0; i < NumResultVecs; ++i) {
14577       NewResults.push_back(SDValue(UpdN.getNode(), i));
14578     }
14579     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
14580     DCI.CombineTo(N, NewResults);
14581     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
14582 
14583     break;
14584   }
14585   return SDValue();
14586 }
14587 
14588 // Checks to see if the value is the prescribed width and returns information
14589 // about its extension mode.
14590 static
14591 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
14592   ExtType = ISD::NON_EXTLOAD;
14593   switch(V.getNode()->getOpcode()) {
14594   default:
14595     return false;
14596   case ISD::LOAD: {
14597     LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
14598     if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
14599        || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
14600       ExtType = LoadNode->getExtensionType();
14601       return true;
14602     }
14603     return false;
14604   }
14605   case ISD::AssertSext: {
14606     VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
14607     if ((TypeNode->getVT() == MVT::i8 && width == 8)
14608        || (TypeNode->getVT() == MVT::i16 && width == 16)) {
14609       ExtType = ISD::SEXTLOAD;
14610       return true;
14611     }
14612     return false;
14613   }
14614   case ISD::AssertZext: {
14615     VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
14616     if ((TypeNode->getVT() == MVT::i8 && width == 8)
14617        || (TypeNode->getVT() == MVT::i16 && width == 16)) {
14618       ExtType = ISD::ZEXTLOAD;
14619       return true;
14620     }
14621     return false;
14622   }
14623   case ISD::Constant:
14624   case ISD::TargetConstant: {
14625     return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
14626            1LL << (width - 1);
14627   }
14628   }
14629 
14630   return true;
14631 }
14632 
14633 // This function does a whole lot of voodoo to determine if the tests are
14634 // equivalent without and with a mask. Essentially what happens is that given a
14635 // DAG resembling:
14636 //
14637 //  +-------------+ +-------------+ +-------------+ +-------------+
14638 //  |    Input    | | AddConstant | | CompConstant| |     CC      |
14639 //  +-------------+ +-------------+ +-------------+ +-------------+
14640 //           |           |           |               |
14641 //           V           V           |    +----------+
14642 //          +-------------+  +----+  |    |
14643 //          |     ADD     |  |0xff|  |    |
14644 //          +-------------+  +----+  |    |
14645 //                  |           |    |    |
14646 //                  V           V    |    |
14647 //                 +-------------+   |    |
14648 //                 |     AND     |   |    |
14649 //                 +-------------+   |    |
14650 //                      |            |    |
14651 //                      +-----+      |    |
14652 //                            |      |    |
14653 //                            V      V    V
14654 //                           +-------------+
14655 //                           |     CMP     |
14656 //                           +-------------+
14657 //
// The AND node may be safely removed for some combinations of inputs. In
// particular we need to take into account the extension type of the Input,
// the exact values of AddConstant, CompConstant, and CC, along with the nominal
// width of the input (this can work for any width inputs; the above graph is
// specific to 8 bits).
14663 //
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
// problem was simplified by working with 4 bit inputs, which means we only
// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
// patterns present in both extensions (0,7). For every distinct set of
// AddConstant and CompConstant bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result of this function is true for
// all 16 distinct bit patterns for the current extension type of Input (w0).
14673 //
14674 //   sub      w8, w0, w1
14675 //   and      w10, w8, #0x0f
14676 //   cmp      w8, w2
14677 //   cset     w9, AArch64CC
14678 //   cmp      w10, w2
14679 //   cset     w11, AArch64CC
14680 //   cmp      w9, w11
14681 //   cset     w0, eq
14682 //   ret
14683 //
// Since the above function shows when the outputs are equivalent it defines
// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
// would be expensive to run during compiles. The equations below were written
// in a test harness that confirmed they gave equivalent outputs to the above
// function for all inputs, so they can be used to determine if the removal is
// legal instead.
14690 //
14691 // isEquivalentMaskless() is the code for testing if the AND can be removed
14692 // factored out of the DAG recognition as the DAG can take several forms.
14693 
14694 static bool isEquivalentMaskless(unsigned CC, unsigned width,
14695                                  ISD::LoadExtType ExtType, int AddConstant,
14696                                  int CompConstant) {
14697   // By being careful about our equations and only writing the in term
14698   // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
14699   // make them generally applicable to all bit widths.
14700   int MaxUInt = (1 << width);
14701 
14702   // For the purposes of these comparisons sign extending the type is
14703   // equivalent to zero extending the add and displacing it by half the integer
14704   // width. Provided we are careful and make sure our equations are valid over
14705   // the whole range we can just adjust the input and avoid writing equations
14706   // for sign extended inputs.
14707   if (ExtType == ISD::SEXTLOAD)
14708     AddConstant -= (1 << (width-1));
14709 
14710   switch(CC) {
14711   case AArch64CC::LE:
14712   case AArch64CC::GT:
14713     if ((AddConstant == 0) ||
14714         (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
14715         (AddConstant >= 0 && CompConstant < 0) ||
14716         (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
14717       return true;
14718     break;
14719   case AArch64CC::LT:
14720   case AArch64CC::GE:
14721     if ((AddConstant == 0) ||
14722         (AddConstant >= 0 && CompConstant <= 0) ||
14723         (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
14724       return true;
14725     break;
14726   case AArch64CC::HI:
14727   case AArch64CC::LS:
14728     if ((AddConstant >= 0 && CompConstant < 0) ||
14729        (AddConstant <= 0 && CompConstant >= -1 &&
14730         CompConstant < AddConstant + MaxUInt))
14731       return true;
14732    break;
14733   case AArch64CC::PL:
14734   case AArch64CC::MI:
14735     if ((AddConstant == 0) ||
14736         (AddConstant > 0 && CompConstant <= 0) ||
14737         (AddConstant < 0 && CompConstant <= AddConstant))
14738       return true;
14739     break;
14740   case AArch64CC::LO:
14741   case AArch64CC::HS:
14742     if ((AddConstant >= 0 && CompConstant <= 0) ||
14743         (AddConstant <= 0 && CompConstant >= 0 &&
14744          CompConstant <= AddConstant + MaxUInt))
14745       return true;
14746     break;
14747   case AArch64CC::EQ:
14748   case AArch64CC::NE:
14749     if ((AddConstant > 0 && CompConstant < 0) ||
14750         (AddConstant < 0 && CompConstant >= 0 &&
14751          CompConstant < AddConstant + MaxUInt) ||
14752         (AddConstant >= 0 && CompConstant >= 0 &&
14753          CompConstant >= AddConstant) ||
14754         (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
14755       return true;
14756     break;
14757   case AArch64CC::VS:
14758   case AArch64CC::VC:
14759   case AArch64CC::AL:
14760   case AArch64CC::NV:
14761     return true;
14762   case AArch64CC::Invalid:
14763     break;
14764   }
14765 
14766   return false;
14767 }
14768 
14769 static
14770 SDValue performCONDCombine(SDNode *N,
14771                            TargetLowering::DAGCombinerInfo &DCI,
14772                            SelectionDAG &DAG, unsigned CCIndex,
14773                            unsigned CmpIndex) {
14774   unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
14775   SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
14776   unsigned CondOpcode = SubsNode->getOpcode();
14777 
14778   if (CondOpcode != AArch64ISD::SUBS)
14779     return SDValue();
14780 
14781   // There is a SUBS feeding this condition. Is it fed by a mask we can
14782   // use?
14783 
14784   SDNode *AndNode = SubsNode->getOperand(0).getNode();
14785   unsigned MaskBits = 0;
14786 
14787   if (AndNode->getOpcode() != ISD::AND)
14788     return SDValue();
14789 
14790   if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
14791     uint32_t CNV = CN->getZExtValue();
14792     if (CNV == 255)
14793       MaskBits = 8;
14794     else if (CNV == 65535)
14795       MaskBits = 16;
14796   }
14797 
14798   if (!MaskBits)
14799     return SDValue();
14800 
14801   SDValue AddValue = AndNode->getOperand(0);
14802 
14803   if (AddValue.getOpcode() != ISD::ADD)
14804     return SDValue();
14805 
14806   // The basic dag structure is correct, grab the inputs and validate them.
14807 
14808   SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
14809   SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
14810   SDValue SubsInputValue = SubsNode->getOperand(1);
14811 
14812   // The mask is present and the provenance of all the values is a smaller type,
14813   // lets see if the mask is superfluous.
14814 
14815   if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
14816       !isa<ConstantSDNode>(SubsInputValue.getNode()))
14817     return SDValue();
14818 
14819   ISD::LoadExtType ExtType;
14820 
14821   if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
14822       !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
14823       !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
14824     return SDValue();
14825 
14826   if(!isEquivalentMaskless(CC, MaskBits, ExtType,
14827                 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
14828                 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
14829     return SDValue();
14830 
14831   // The AND is not necessary, remove it.
14832 
14833   SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
14834                                SubsNode->getValueType(1));
14835   SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
14836 
14837   SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
14838   DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
14839 
14840   return SDValue(N, 0);
14841 }
14842 
14843 // Optimize compare with zero and branch.
14844 static SDValue performBRCONDCombine(SDNode *N,
14845                                     TargetLowering::DAGCombinerInfo &DCI,
14846                                     SelectionDAG &DAG) {
14847   MachineFunction &MF = DAG.getMachineFunction();
14848   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
14849   // will not be produced, as they are conditional branch instructions that do
14850   // not set flags.
14851   if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
14852     return SDValue();
14853 
14854   if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
14855     N = NV.getNode();
14856   SDValue Chain = N->getOperand(0);
14857   SDValue Dest = N->getOperand(1);
14858   SDValue CCVal = N->getOperand(2);
14859   SDValue Cmp = N->getOperand(3);
14860 
14861   assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
14862   unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
14863   if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
14864     return SDValue();
14865 
14866   unsigned CmpOpc = Cmp.getOpcode();
14867   if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
14868     return SDValue();
14869 
14870   // Only attempt folding if there is only one use of the flag and no use of the
14871   // value.
14872   if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
14873     return SDValue();
14874 
14875   SDValue LHS = Cmp.getOperand(0);
14876   SDValue RHS = Cmp.getOperand(1);
14877 
14878   assert(LHS.getValueType() == RHS.getValueType() &&
14879          "Expected the value type to be the same for both operands!");
14880   if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
14881     return SDValue();
14882 
14883   if (isNullConstant(LHS))
14884     std::swap(LHS, RHS);
14885 
14886   if (!isNullConstant(RHS))
14887     return SDValue();
14888 
14889   if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
14890       LHS.getOpcode() == ISD::SRL)
14891     return SDValue();
14892 
14893   // Fold the compare into the branch instruction.
14894   SDValue BR;
14895   if (CC == AArch64CC::EQ)
14896     BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
14897   else
14898     BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
14899 
14900   // Do not add new nodes to DAG combiner worklist.
14901   DCI.CombineTo(N, BR, false);
14902 
14903   return SDValue();
14904 }
14905 
14906 // Optimize some simple tbz/tbnz cases.  Returns the new operand and bit to test
14907 // as well as whether the test should be inverted.  This code is required to
14908 // catch these cases (as opposed to standard dag combines) because
14909 // AArch64ISD::TBZ is matched during legalization.
14910 static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
14911                                  SelectionDAG &DAG) {
14912 
14913   if (!Op->hasOneUse())
14914     return Op;
14915 
14916   // We don't handle undef/constant-fold cases below, as they should have
14917   // already been taken care of (e.g. and of 0, test of undefined shifted bits,
14918   // etc.)
14919 
14920   // (tbz (trunc x), b) -> (tbz x, b)
14921   // This case is just here to enable more of the below cases to be caught.
14922   if (Op->getOpcode() == ISD::TRUNCATE &&
14923       Bit < Op->getValueType(0).getSizeInBits()) {
14924     return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
14925   }
14926 
14927   // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
14928   if (Op->getOpcode() == ISD::ANY_EXTEND &&
14929       Bit < Op->getOperand(0).getValueSizeInBits()) {
14930     return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
14931   }
14932 
14933   if (Op->getNumOperands() != 2)
14934     return Op;
14935 
14936   auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14937   if (!C)
14938     return Op;
14939 
14940   switch (Op->getOpcode()) {
14941   default:
14942     return Op;
14943 
14944   // (tbz (and x, m), b) -> (tbz x, b)
14945   case ISD::AND:
14946     if ((C->getZExtValue() >> Bit) & 1)
14947       return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
14948     return Op;
14949 
14950   // (tbz (shl x, c), b) -> (tbz x, b-c)
14951   case ISD::SHL:
14952     if (C->getZExtValue() <= Bit &&
14953         (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
14954       Bit = Bit - C->getZExtValue();
14955       return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
14956     }
14957     return Op;
14958 
14959   // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
14960   case ISD::SRA:
14961     Bit = Bit + C->getZExtValue();
14962     if (Bit >= Op->getValueType(0).getSizeInBits())
14963       Bit = Op->getValueType(0).getSizeInBits() - 1;
14964     return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
14965 
14966   // (tbz (srl x, c), b) -> (tbz x, b+c)
14967   case ISD::SRL:
14968     if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
14969       Bit = Bit + C->getZExtValue();
14970       return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
14971     }
14972     return Op;
14973 
14974   // (tbz (xor x, -1), b) -> (tbnz x, b)
14975   case ISD::XOR:
14976     if ((C->getZExtValue() >> Bit) & 1)
14977       Invert = !Invert;
14978     return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
14979   }
14980 }
14981 
14982 // Optimize test single bit zero/non-zero and branch.
14983 static SDValue performTBZCombine(SDNode *N,
14984                                  TargetLowering::DAGCombinerInfo &DCI,
14985                                  SelectionDAG &DAG) {
14986   unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
14987   bool Invert = false;
14988   SDValue TestSrc = N->getOperand(1);
14989   SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
14990 
14991   if (TestSrc == NewTestSrc)
14992     return SDValue();
14993 
14994   unsigned NewOpc = N->getOpcode();
14995   if (Invert) {
14996     if (NewOpc == AArch64ISD::TBZ)
14997       NewOpc = AArch64ISD::TBNZ;
14998     else {
14999       assert(NewOpc == AArch64ISD::TBNZ);
15000       NewOpc = AArch64ISD::TBZ;
15001     }
15002   }
15003 
15004   SDLoc DL(N);
15005   return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
15006                      DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
15007 }
15008 
15009 // vselect (v1i1 setcc) ->
15010 //     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
15011 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
15012 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
15013 // such VSELECT.
15014 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
15015   SDValue N0 = N->getOperand(0);
15016   EVT CCVT = N0.getValueType();
15017 
15018   if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
15019       CCVT.getVectorElementType() != MVT::i1)
15020     return SDValue();
15021 
15022   EVT ResVT = N->getValueType(0);
15023   EVT CmpVT = N0.getOperand(0).getValueType();
15024   // Only combine when the result type is of the same size as the compared
15025   // operands.
15026   if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
15027     return SDValue();
15028 
15029   SDValue IfTrue = N->getOperand(1);
15030   SDValue IfFalse = N->getOperand(2);
15031   SDValue SetCC =
15032       DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
15033                    N0.getOperand(0), N0.getOperand(1),
15034                    cast<CondCodeSDNode>(N0.getOperand(2))->get());
15035   return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
15036                      IfTrue, IfFalse);
15037 }
15038 
15039 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
15040 /// the compare-mask instructions rather than going via NZCV, even if LHS and
15041 /// RHS are really scalar. This replaces any scalar setcc in the above pattern
15042 /// with a vector one followed by a DUP shuffle on the result.
15043 static SDValue performSelectCombine(SDNode *N,
15044                                     TargetLowering::DAGCombinerInfo &DCI) {
15045   SelectionDAG &DAG = DCI.DAG;
15046   SDValue N0 = N->getOperand(0);
15047   EVT ResVT = N->getValueType(0);
15048 
15049   if (N0.getOpcode() != ISD::SETCC)
15050     return SDValue();
15051 
15052   // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
15053   // scalar SetCCResultType. We also don't expect vectors, because we assume
15054   // that selects fed by vector SETCCs are canonicalized to VSELECT.
15055   assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
15056          "Scalar-SETCC feeding SELECT has unexpected result type!");
15057 
15058   // If NumMaskElts == 0, the comparison is larger than select result. The
15059   // largest real NEON comparison is 64-bits per lane, which means the result is
15060   // at most 32-bits and an illegal vector. Just bail out for now.
15061   EVT SrcVT = N0.getOperand(0).getValueType();
15062 
15063   // Don't try to do this optimization when the setcc itself has i1 operands.
15064   // There are no legal vectors of i1, so this would be pointless.
15065   if (SrcVT == MVT::i1)
15066     return SDValue();
15067 
15068   int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
15069   if (!ResVT.isVector() || NumMaskElts == 0)
15070     return SDValue();
15071 
15072   SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
15073   EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
15074 
15075   // Also bail out if the vector CCVT isn't the same size as ResVT.
15076   // This can happen if the SETCC operand size doesn't divide the ResVT size
15077   // (e.g., f64 vs v3f32).
15078   if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
15079     return SDValue();
15080 
15081   // Make sure we didn't create illegal types, if we're not supposed to.
15082   assert(DCI.isBeforeLegalize() ||
15083          DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
15084 
15085   // First perform a vector comparison, where lane 0 is the one we're interested
15086   // in.
15087   SDLoc DL(N0);
15088   SDValue LHS =
15089       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
15090   SDValue RHS =
15091       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
15092   SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
15093 
15094   // Now duplicate the comparison mask we want across all other lanes.
15095   SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
15096   SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
15097   Mask = DAG.getNode(ISD::BITCAST, DL,
15098                      ResVT.changeVectorElementTypeToInteger(), Mask);
15099 
15100   return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
15101 }
15102 
15103 /// Get rid of unnecessary NVCASTs (that don't change the type).
15104 static SDValue performNVCASTCombine(SDNode *N) {
15105   if (N->getValueType(0) == N->getOperand(0).getValueType())
15106     return N->getOperand(0);
15107 
15108   return SDValue();
15109 }
15110 
15111 // If all users of the globaladdr are of the form (globaladdr + constant), find
15112 // the smallest constant, fold it into the globaladdr's offset and rewrite the
15113 // globaladdr as (globaladdr + constant) - constant.
15114 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
15115                                            const AArch64Subtarget *Subtarget,
15116                                            const TargetMachine &TM) {
15117   auto *GN = cast<GlobalAddressSDNode>(N);
15118   if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
15119       AArch64II::MO_NO_FLAG)
15120     return SDValue();
15121 
15122   uint64_t MinOffset = -1ull;
15123   for (SDNode *N : GN->uses()) {
15124     if (N->getOpcode() != ISD::ADD)
15125       return SDValue();
15126     auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
15127     if (!C)
15128       C = dyn_cast<ConstantSDNode>(N->getOperand(1));
15129     if (!C)
15130       return SDValue();
15131     MinOffset = std::min(MinOffset, C->getZExtValue());
15132   }
15133   uint64_t Offset = MinOffset + GN->getOffset();
15134 
15135   // Require that the new offset is larger than the existing one. Otherwise, we
15136   // can end up oscillating between two possible DAGs, for example,
15137   // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
15138   if (Offset <= uint64_t(GN->getOffset()))
15139     return SDValue();
15140 
15141   // Check whether folding this offset is legal. It must not go out of bounds of
15142   // the referenced object to avoid violating the code model, and must be
15143   // smaller than 2^21 because this is the largest offset expressible in all
15144   // object formats.
15145   //
15146   // This check also prevents us from folding negative offsets, which will end
15147   // up being treated in the same way as large positive ones. They could also
15148   // cause code model violations, and aren't really common enough to matter.
15149   if (Offset >= (1 << 21))
15150     return SDValue();
15151 
15152   const GlobalValue *GV = GN->getGlobal();
15153   Type *T = GV->getValueType();
15154   if (!T->isSized() ||
15155       Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
15156     return SDValue();
15157 
15158   SDLoc DL(GN);
15159   SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
15160   return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
15161                      DAG.getConstant(MinOffset, DL, MVT::i64));
15162 }
15163 
15164 // Turns the vector of indices into a vector of byte offstes by scaling Offset
15165 // by (BitWidth / 8).
15166 static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
15167                                           SDLoc DL, unsigned BitWidth) {
15168   assert(Offset.getValueType().isScalableVector() &&
15169          "This method is only for scalable vectors of offsets");
15170 
15171   SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
15172   SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
15173 
15174   return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
15175 }
15176 
/// Check if the value of \p OffsetInBytes can be used as an immediate for
/// the gather load/prefetch and scatter store instructions with vector base and
/// immediate offset addressing mode:
///
///      [<Zn>.[S|D]{, #<imm>}]
///
/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
///
/// \p OffsetInBytes is taken as a 64-bit value so that a constant obtained via
/// getZExtValue() is range-checked in full instead of being truncated to 32
/// bits first (which would make e.g. 0x100000000 look like a valid 0).
/// \p ScalarSizeInBytes is sizeof(<T>); a size of zero never yields a valid
/// immediate.
inline static bool isValidImmForSVEVecImmAddrMode(uint64_t OffsetInBytes,
                                                  unsigned ScalarSizeInBytes) {
  // No immediate can be expressed in units of a zero-sized element; bail out
  // before dividing by it.
  if (ScalarSizeInBytes == 0)
    return false;

  // The immediate is not a multiple of the scalar size.
  if (OffsetInBytes % ScalarSizeInBytes != 0)
    return false;

  // The immediate is in range only for k = 0, 1, ..., 31.
  return OffsetInBytes / ScalarSizeInBytes <= 31;
}

15197 
15198 /// Check if the value of \p Offset represents a valid immediate for the SVE
15199 /// gather load/prefetch and scatter store instructiona with vector base and
15200 /// immediate offset addressing mode:
15201 ///
15202 ///      [<Zn>.[S|D]{, #<imm>}]
15203 ///
15204 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
15205 static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
15206                                            unsigned ScalarSizeInBytes) {
15207   ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
15208   return OffsetConst && isValidImmForSVEVecImmAddrMode(
15209                             OffsetConst->getZExtValue(), ScalarSizeInBytes);
15210 }
15211 
/// Rewrite the SVE scatter-store intrinsic node \p N into a node with the
/// target opcode \p Opcode (which may be adjusted below depending on the
/// addressing mode actually usable).
///
/// The operands read from \p N are: 0 = chain, 2 = data to store,
/// 3 = governing predicate, 4 = base, 5 = offset. When \p OnlyPackedOffsets is
/// false, an nxv2i32 offset vector is any-extended to nxv2i64.
///
/// Returns the new chain-only node, or an empty SDValue when the data does not
/// fit into one SVE register, is an unsupported FP type, or the base/offset
/// type is not legal.
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
                                          unsigned Opcode,
                                          bool OnlyPackedOffsets = true) {
  const SDValue Src = N->getOperand(2);
  const EVT SrcVT = Src->getValueType(0);
  assert(SrcVT.isScalableVector() &&
         "Scatter stores are only possible for SVE vectors");

  SDLoc DL(N);
  MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();

  // Make sure that source data will fit into an SVE register
  if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
    return SDValue();

  // For FPs, ACLE only supports _packed_ single and double precision types.
  if (SrcElVT.isFloatingPoint())
    if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
      return SDValue();

  // Depending on the addressing mode, this is either a pointer or a vector of
  // pointers (that fits into one register)
  SDValue Base = N->getOperand(4);
  // Depending on the addressing mode, this is either a single offset or a
  // vector of offsets  (that fits into one register)
  SDValue Offset = N->getOperand(5);

  // For "scalar + vector of indices", just scale the indices. This only
  // applies to non-temporal scatters because there's no instruction that takes
  // indices.
  if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
    Offset =
        getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
    Opcode = AArch64ISD::SSTNT1_PRED;
  }

  // In the case of non-temporal scatter stores there's only one SVE
  // instruction per data-size: "vector + scalar", i.e.
  //    * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
  // Since we do have intrinsics that allow the arguments to be in a different
  // order, we may need to swap them to match the spec.
  if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
    std::swap(Base, Offset);

  // SST1_IMM requires that the offset is an immediate that is:
  //    * a multiple of #SizeInBytes,
  //    * in the range [0, 31 x #SizeInBytes],
  // where #SizeInBytes is the size in bytes of the stored items. For
  // immediates outside that range and non-immediate scalar offsets use SST1 or
  // SST1_UXTW instead.
  if (Opcode == AArch64ISD::SST1_IMM_PRED) {
    if (!isValidImmForSVEVecImmAddrMode(Offset,
                                        SrcVT.getScalarSizeInBits() / 8)) {
      if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
        Opcode = AArch64ISD::SST1_UXTW_PRED;
      else
        Opcode = AArch64ISD::SST1_PRED;

      // The scalar now acts as the base and the vector as the offsets.
      std::swap(Base, Offset);
    }
  }

  auto &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(Base.getValueType()))
    return SDValue();

  // Some scatter store variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
  // nxv2i64. Legalize accordingly.
  if (!OnlyPackedOffsets &&
      Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
    Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);

  if (!TLI.isTypeLegal(Offset.getValueType()))
    return SDValue();

  // Source value type that is representable in hardware
  EVT HwSrcVt = getSVEContainerType(SrcVT);

  // Keep the original type of the input data to store - this is needed to be
  // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
  // FP values we want the integer equivalent, so just use HwSrcVt.
  SDValue InputVT = DAG.getValueType(SrcVT);
  if (SrcVT.isFloatingPoint())
    InputVT = DAG.getValueType(HwSrcVt);

  // The new node only produces a chain.
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue SrcNew;

  // Convert the data to the hardware container type: FP data is bitcast,
  // integer data is any-extended.
  if (Src.getValueType().isFloatingPoint())
    SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
  else
    SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);

  SDValue Ops[] = {N->getOperand(0), // Chain
                   SrcNew,
                   N->getOperand(3), // Pg
                   Base,
                   Offset,
                   InputVT};

  return DAG.getNode(Opcode, DL, VTs, Ops);
}

15315 
/// Rewrite the SVE gather-load intrinsic node \p N into a node with the target
/// opcode \p Opcode (which may be adjusted below depending on the addressing
/// mode actually usable).
///
/// The operands read from \p N are: 0 = chain, 2 = governing predicate,
/// 3 = base, 4 = offset. When \p OnlyPackedOffsets is false, an nxv2i32 offset
/// vector is any-extended to nxv2i64.
///
/// Returns the merged {loaded value, chain} pair, with the loaded value
/// converted back to the original result type, or an empty SDValue when the
/// result does not fit into one SVE register or the base type is not legal.
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
                                        unsigned Opcode,
                                        bool OnlyPackedOffsets = true) {
  const EVT RetVT = N->getValueType(0);
  assert(RetVT.isScalableVector() &&
         "Gather loads are only possible for SVE vectors");

  SDLoc DL(N);

  // Make sure that the loaded data will fit into an SVE register
  if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
    return SDValue();

  // Depending on the addressing mode, this is either a pointer or a vector of
  // pointers (that fits into one register)
  SDValue Base = N->getOperand(3);
  // Depending on the addressing mode, this is either a single offset or a
  // vector of offsets  (that fits into one register)
  SDValue Offset = N->getOperand(4);

  // For "scalar + vector of indices", just scale the indices. This only
  // applies to non-temporal gathers because there's no instruction that takes
  // indices.
  if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
    Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
                                        RetVT.getScalarSizeInBits());
    Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
  }

  // In the case of non-temporal gather loads there's only one SVE instruction
  // per data-size: "vector + scalar", i.e.
  //    * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
  // Since we do have intrinsics that allow the arguments to be in a different
  // order, we may need to swap them to match the spec.
  if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
      Offset.getValueType().isVector())
    std::swap(Base, Offset);

  // GLD{FF}1_IMM requires that the offset is an immediate that is:
  //    * a multiple of #SizeInBytes,
  //    * in the range [0, 31 x #SizeInBytes],
  // where #SizeInBytes is the size in bytes of the loaded items. For
  // immediates outside that range and non-immediate scalar offsets use
  // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
  if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
      Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
    if (!isValidImmForSVEVecImmAddrMode(Offset,
                                        RetVT.getScalarSizeInBits() / 8)) {
      if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
        Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
                     ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
                     : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
      else
        Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
                     ? AArch64ISD::GLD1_MERGE_ZERO
                     : AArch64ISD::GLDFF1_MERGE_ZERO;

      // The scalar now acts as the base and the vector as the offsets.
      std::swap(Base, Offset);
    }
  }

  auto &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(Base.getValueType()))
    return SDValue();

  // Some gather load variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
  // nxv2i64. Legalize accordingly.
  if (!OnlyPackedOffsets &&
      Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
    Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);

  // Return value type that is representable in hardware
  EVT HwRetVt = getSVEContainerType(RetVT);

  // Keep the original output value type around - this is needed to be able to
  // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
  // values we want the integer equivalent, so just use HwRetVT.
  SDValue OutVT = DAG.getValueType(RetVT);
  if (RetVT.isFloatingPoint())
    OutVT = DAG.getValueType(HwRetVt);

  // The new node produces the loaded value (in the container type) and a
  // chain.
  SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
  SDValue Ops[] = {N->getOperand(0), // Chain
                   N->getOperand(2), // Pg
                   Base, Offset, OutVT};

  SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
  // Grab the chain result now, before Load is rebound to a conversion node.
  SDValue LoadChain = SDValue(Load.getNode(), 1);

  // Integer results wider than requested are truncated back to RetVT.
  if (RetVT.isInteger() && (RetVT != HwRetVt))
    Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));

  // If the original return value was FP, bitcast accordingly. Doing it here
  // means that we can avoid adding TableGen patterns for FPs.
  if (RetVT.isFloatingPoint())
    Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));

  return DAG.getMergeValues({Load, LoadChain}, DL);
}

15416 
/// Combine a SIGN_EXTEND_INREG node \p N with its source operand.
///
/// Two folds are attempted:
///  * a sign extend of an unsigned unpack (UUNPKHI/UUNPKLO) becomes the
///    matching signed unpack, with the sign extend pushed onto the unpack's
///    operand;
///  * after operation legalization, and only when
///    EnableCombineMGatherIntrinsics is set, a sign extend of an SVE load node
///    becomes the sign-extending variant of that load, provided the extended-
///    from type equals the load's memory type and the load has a single use.
///
/// Returns the replacement value, \p N itself when the replacement was already
/// performed through \p DCI, or an empty SDValue when nothing applies.
static SDValue
performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                              SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Src = N->getOperand(0);
  unsigned Opc = Src->getOpcode();

  // Sign extend of an unsigned unpack -> signed unpack
  if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {

    unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
                                               : AArch64ISD::SUNPKLO;

    // Push the sign extend to the operand of the unpack
    // This is necessary where, for example, the operand of the unpack
    // is another unpack:
    // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
    // ->
    // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
    // ->
    // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
    SDValue ExtOp = Src->getOperand(0);
    auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
    EVT EltTy = VT.getVectorElementType();
    // EltTy is only read by the assert below; silence the unused-variable
    // warning in builds where asserts are compiled out.
    (void)EltTy;

    assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
           "Sign extending from an invalid type");

    // The unpack's operand has twice as many lanes as its result, so the
    // extended-from type needs twice as many elements too.
    EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());

    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
                              ExtOp, DAG.getValueType(ExtVT));

    return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
  }

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (!EnableCombineMGatherIntrinsics)
    return SDValue();

  // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
  // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
  unsigned NewOpc;
  // Index of the source node's operand that carries the memory VT: 4 for the
  // gather loads, 3 for the contiguous loads (set in the cases below).
  unsigned MemVTOpNum = 4;
  switch (Opc) {
  case AArch64ISD::LD1_MERGE_ZERO:
    NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
    MemVTOpNum = 3;
    break;
  case AArch64ISD::LDNF1_MERGE_ZERO:
    NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
    MemVTOpNum = 3;
    break;
  case AArch64ISD::LDFF1_MERGE_ZERO:
    NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
    MemVTOpNum = 3;
    break;
  case AArch64ISD::GLD1_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
    break;
  case AArch64ISD::GLDNT1_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
    break;
  default:
    return SDValue();
  }

  EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
  EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();

  // Only fold when the sign extension starts from exactly the type the load
  // reads from memory, and nothing else uses the zero-extending load.
  if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
    return SDValue();

  EVT DstVT = N->getValueType(0);
  SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);

  // The sign-extending load takes the same operands as the original load.
  SmallVector<SDValue, 5> Ops;
  for (unsigned I = 0; I < Src->getNumOperands(); ++I)
    Ops.push_back(Src->getOperand(I));

  // Replace both the sign extend and the original load (value and chain) with
  // the new sign-extending load.
  SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
  DCI.CombineTo(N, ExtLoad);
  DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));

  // Return N so it doesn't get rechecked
  return SDValue(N, 0);
}

15546 
15547 /// Legalize the gather prefetch (scalar + vector addressing mode) when the
15548 /// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
15549 /// != nxv2i32) do not need legalization.
15550 static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
15551   const unsigned OffsetPos = 4;
15552   SDValue Offset = N->getOperand(OffsetPos);
15553 
15554   // Not an unpacked vector, bail out.
15555   if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
15556     return SDValue();
15557 
15558   // Extend the unpacked offset vector to 64-bit lanes.
15559   SDLoc DL(N);
15560   Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
15561   SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
15562   // Replace the offset operand with the 64-bit one.
15563   Ops[OffsetPos] = Offset;
15564 
15565   return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
15566 }
15567 
15568 /// Combines a node carrying the intrinsic
15569 /// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
15570 /// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
15571 /// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
15572 /// sve gather prefetch instruction with vector plus immediate addressing mode.
15573 static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
15574                                                unsigned ScalarSizeInBytes) {
15575   const unsigned ImmPos = 4, OffsetPos = 3;
15576   // No need to combine the node if the immediate is valid...
15577   if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
15578     return SDValue();
15579 
15580   // ...otherwise swap the offset base with the offset...
15581   SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
15582   std::swap(Ops[ImmPos], Ops[OffsetPos]);
15583   // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
15584   // `aarch64_sve_prfb_gather_uxtw_index`.
15585   SDLoc DL(N);
15586   Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
15587                            MVT::i64);
15588 
15589   return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
15590 }
15591 
15592 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
15593                                                  DAGCombinerInfo &DCI) const {
15594   SelectionDAG &DAG = DCI.DAG;
15595   switch (N->getOpcode()) {
15596   default:
15597     LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
15598     break;
15599   case ISD::ABS:
15600     return performABSCombine(N, DAG, DCI, Subtarget);
15601   case ISD::ADD:
15602   case ISD::SUB:
15603     return performAddSubCombine(N, DCI, DAG);
15604   case ISD::XOR:
15605     return performXorCombine(N, DAG, DCI, Subtarget);
15606   case ISD::MUL:
15607     return performMulCombine(N, DAG, DCI, Subtarget);
15608   case ISD::SINT_TO_FP:
15609   case ISD::UINT_TO_FP:
15610     return performIntToFpCombine(N, DAG, Subtarget);
15611   case ISD::FP_TO_SINT:
15612   case ISD::FP_TO_UINT:
15613     return performFpToIntCombine(N, DAG, DCI, Subtarget);
15614   case ISD::FDIV:
15615     return performFDivCombine(N, DAG, DCI, Subtarget);
15616   case ISD::OR:
15617     return performORCombine(N, DCI, Subtarget);
15618   case ISD::AND:
15619     return performANDCombine(N, DCI);
15620   case ISD::SRL:
15621     return performSRLCombine(N, DCI);
15622   case ISD::INTRINSIC_WO_CHAIN:
15623     return performIntrinsicCombine(N, DCI, Subtarget);
15624   case ISD::ANY_EXTEND:
15625   case ISD::ZERO_EXTEND:
15626   case ISD::SIGN_EXTEND:
15627     return performExtendCombine(N, DCI, DAG);
15628   case ISD::SIGN_EXTEND_INREG:
15629     return performSignExtendInRegCombine(N, DCI, DAG);
15630   case ISD::TRUNCATE:
15631     return performVectorTruncateCombine(N, DCI, DAG);
15632   case ISD::CONCAT_VECTORS:
15633     return performConcatVectorsCombine(N, DCI, DAG);
15634   case ISD::SELECT:
15635     return performSelectCombine(N, DCI);
15636   case ISD::VSELECT:
15637     return performVSelectCombine(N, DCI.DAG);
15638   case ISD::LOAD:
15639     if (performTBISimplification(N->getOperand(1), DCI, DAG))
15640       return SDValue(N, 0);
15641     break;
15642   case ISD::STORE:
15643     return performSTORECombine(N, DCI, DAG, Subtarget);
15644   case ISD::MGATHER:
15645   case ISD::MSCATTER:
15646     return performMaskedGatherScatterCombine(N, DCI, DAG);
15647   case AArch64ISD::BRCOND:
15648     return performBRCONDCombine(N, DCI, DAG);
15649   case AArch64ISD::TBNZ:
15650   case AArch64ISD::TBZ:
15651     return performTBZCombine(N, DCI, DAG);
15652   case AArch64ISD::CSEL:
15653     return performCONDCombine(N, DCI, DAG, 2, 3);
15654   case AArch64ISD::DUP:
15655     return performPostLD1Combine(N, DCI, false);
15656   case AArch64ISD::NVCAST:
15657     return performNVCASTCombine(N);
15658   case AArch64ISD::UZP1:
15659     return performUzpCombine(N, DAG);
15660   case ISD::INSERT_VECTOR_ELT:
15661     return performPostLD1Combine(N, DCI, true);
15662   case ISD::EXTRACT_VECTOR_ELT:
15663     return performExtractVectorEltCombine(N, DAG);
15664   case ISD::VECREDUCE_ADD:
15665     return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
15666   case ISD::INTRINSIC_VOID:
15667   case ISD::INTRINSIC_W_CHAIN:
15668     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
15669     case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
15670       return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
15671     case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
15672       return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
15673     case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
15674       return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
15675     case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
15676       return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
15677     case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
15678     case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
15679     case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
15680     case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
15681     case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
15682     case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
15683     case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
15684     case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
15685       return legalizeSVEGatherPrefetchOffsVec(N, DAG);
15686     case Intrinsic::aarch64_neon_ld2:
15687     case Intrinsic::aarch64_neon_ld3:
15688     case Intrinsic::aarch64_neon_ld4:
15689     case Intrinsic::aarch64_neon_ld1x2:
15690     case Intrinsic::aarch64_neon_ld1x3:
15691     case Intrinsic::aarch64_neon_ld1x4:
15692     case Intrinsic::aarch64_neon_ld2lane:
15693     case Intrinsic::aarch64_neon_ld3lane:
15694     case Intrinsic::aarch64_neon_ld4lane:
15695     case Intrinsic::aarch64_neon_ld2r:
15696     case Intrinsic::aarch64_neon_ld3r:
15697     case Intrinsic::aarch64_neon_ld4r:
15698     case Intrinsic::aarch64_neon_st2:
15699     case Intrinsic::aarch64_neon_st3:
15700     case Intrinsic::aarch64_neon_st4:
15701     case Intrinsic::aarch64_neon_st1x2:
15702     case Intrinsic::aarch64_neon_st1x3:
15703     case Intrinsic::aarch64_neon_st1x4:
15704     case Intrinsic::aarch64_neon_st2lane:
15705     case Intrinsic::aarch64_neon_st3lane:
15706     case Intrinsic::aarch64_neon_st4lane:
15707       return performNEONPostLDSTCombine(N, DCI, DAG);
15708     case Intrinsic::aarch64_sve_ldnt1:
15709       return performLDNT1Combine(N, DAG);
15710     case Intrinsic::aarch64_sve_ld1rq:
15711       return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
15712     case Intrinsic::aarch64_sve_ld1ro:
15713       return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
15714     case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
15715       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
15716     case Intrinsic::aarch64_sve_ldnt1_gather:
15717       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
15718     case Intrinsic::aarch64_sve_ldnt1_gather_index:
15719       return performGatherLoadCombine(N, DAG,
15720                                       AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
15721     case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
15722       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
15723     case Intrinsic::aarch64_sve_ld1:
15724       return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
15725     case Intrinsic::aarch64_sve_ldnf1:
15726       return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
15727     case Intrinsic::aarch64_sve_ldff1:
15728       return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
15729     case Intrinsic::aarch64_sve_st1:
15730       return performST1Combine(N, DAG);
15731     case Intrinsic::aarch64_sve_stnt1:
15732       return performSTNT1Combine(N, DAG);
15733     case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
15734       return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
15735     case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
15736       return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
15737     case Intrinsic::aarch64_sve_stnt1_scatter:
15738       return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
15739     case Intrinsic::aarch64_sve_stnt1_scatter_index:
15740       return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
15741     case Intrinsic::aarch64_sve_ld1_gather:
15742       return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
15743     case Intrinsic::aarch64_sve_ld1_gather_index:
15744       return performGatherLoadCombine(N, DAG,
15745                                       AArch64ISD::GLD1_SCALED_MERGE_ZERO);
15746     case Intrinsic::aarch64_sve_ld1_gather_sxtw:
15747       return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
15748                                       /*OnlyPackedOffsets=*/false);
15749     case Intrinsic::aarch64_sve_ld1_gather_uxtw:
15750       return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
15751                                       /*OnlyPackedOffsets=*/false);
15752     case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
15753       return performGatherLoadCombine(N, DAG,
15754                                       AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
15755                                       /*OnlyPackedOffsets=*/false);
15756     case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
15757       return performGatherLoadCombine(N, DAG,
15758                                       AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
15759                                       /*OnlyPackedOffsets=*/false);
15760     case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
15761       return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
15762     case Intrinsic::aarch64_sve_ldff1_gather:
15763       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
15764     case Intrinsic::aarch64_sve_ldff1_gather_index:
15765       return performGatherLoadCombine(N, DAG,
15766                                       AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
15767     case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
15768       return performGatherLoadCombine(N, DAG,
15769                                       AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
15770                                       /*OnlyPackedOffsets=*/false);
15771     case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
15772       return performGatherLoadCombine(N, DAG,
15773                                       AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
15774                                       /*OnlyPackedOffsets=*/false);
15775     case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
15776       return performGatherLoadCombine(N, DAG,
15777                                       AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
15778                                       /*OnlyPackedOffsets=*/false);
15779     case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
15780       return performGatherLoadCombine(N, DAG,
15781                                       AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
15782                                       /*OnlyPackedOffsets=*/false);
15783     case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
15784       return performGatherLoadCombine(N, DAG,
15785                                       AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
15786     case Intrinsic::aarch64_sve_st1_scatter:
15787       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
15788     case Intrinsic::aarch64_sve_st1_scatter_index:
15789       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
15790     case Intrinsic::aarch64_sve_st1_scatter_sxtw:
15791       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
15792                                         /*OnlyPackedOffsets=*/false);
15793     case Intrinsic::aarch64_sve_st1_scatter_uxtw:
15794       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
15795                                         /*OnlyPackedOffsets=*/false);
15796     case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
15797       return performScatterStoreCombine(N, DAG,
15798                                         AArch64ISD::SST1_SXTW_SCALED_PRED,
15799                                         /*OnlyPackedOffsets=*/false);
15800     case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
15801       return performScatterStoreCombine(N, DAG,
15802                                         AArch64ISD::SST1_UXTW_SCALED_PRED,
15803                                         /*OnlyPackedOffsets=*/false);
15804     case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
15805       return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
15806     case Intrinsic::aarch64_sve_tuple_get: {
15807       SDLoc DL(N);
15808       SDValue Chain = N->getOperand(0);
15809       SDValue Src1 = N->getOperand(2);
15810       SDValue Idx = N->getOperand(3);
15811 
15812       uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
15813       EVT ResVT = N->getValueType(0);
15814       uint64_t NumLanes = ResVT.getVectorElementCount().getKnownMinValue();
15815       SDValue ExtIdx = DAG.getVectorIdxConstant(IdxConst * NumLanes, DL);
15816       SDValue Val =
15817           DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1, ExtIdx);
15818       return DAG.getMergeValues({Val, Chain}, DL);
15819     }
15820     case Intrinsic::aarch64_sve_tuple_set: {
15821       SDLoc DL(N);
15822       SDValue Chain = N->getOperand(0);
15823       SDValue Tuple = N->getOperand(2);
15824       SDValue Idx = N->getOperand(3);
15825       SDValue Vec = N->getOperand(4);
15826 
15827       EVT TupleVT = Tuple.getValueType();
15828       uint64_t TupleLanes = TupleVT.getVectorElementCount().getKnownMinValue();
15829 
15830       uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
15831       uint64_t NumLanes =
15832           Vec.getValueType().getVectorElementCount().getKnownMinValue();
15833 
15834       if ((TupleLanes % NumLanes) != 0)
15835         report_fatal_error("invalid tuple vector!");
15836 
15837       uint64_t NumVecs = TupleLanes / NumLanes;
15838 
15839       SmallVector<SDValue, 4> Opnds;
15840       for (unsigned I = 0; I < NumVecs; ++I) {
15841         if (I == IdxConst)
15842           Opnds.push_back(Vec);
15843         else {
15844           SDValue ExtIdx = DAG.getVectorIdxConstant(I * NumLanes, DL);
15845           Opnds.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
15846                                       Vec.getValueType(), Tuple, ExtIdx));
15847         }
15848       }
15849       SDValue Concat =
15850           DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds);
15851       return DAG.getMergeValues({Concat, Chain}, DL);
15852     }
15853     case Intrinsic::aarch64_sve_tuple_create2:
15854     case Intrinsic::aarch64_sve_tuple_create3:
15855     case Intrinsic::aarch64_sve_tuple_create4: {
15856       SDLoc DL(N);
15857       SDValue Chain = N->getOperand(0);
15858 
15859       SmallVector<SDValue, 4> Opnds;
15860       for (unsigned I = 2; I < N->getNumOperands(); ++I)
15861         Opnds.push_back(N->getOperand(I));
15862 
15863       EVT VT = Opnds[0].getValueType();
15864       EVT EltVT = VT.getVectorElementType();
15865       EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
15866                                     VT.getVectorElementCount() *
15867                                         (N->getNumOperands() - 2));
15868       SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
15869       return DAG.getMergeValues({Concat, Chain}, DL);
15870     }
15871     case Intrinsic::aarch64_sve_ld2:
15872     case Intrinsic::aarch64_sve_ld3:
15873     case Intrinsic::aarch64_sve_ld4: {
15874       SDLoc DL(N);
15875       SDValue Chain = N->getOperand(0);
15876       SDValue Mask = N->getOperand(2);
15877       SDValue BasePtr = N->getOperand(3);
15878       SDValue LoadOps[] = {Chain, Mask, BasePtr};
15879       unsigned IntrinsicID =
15880           cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
15881       SDValue Result =
15882           LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
15883       return DAG.getMergeValues({Result, Chain}, DL);
15884     }
15885     default:
15886       break;
15887     }
15888     break;
15889   case ISD::GlobalAddress:
15890     return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
15891   }
15892   return SDValue();
15893 }
15894 
15895 // Check if the return value is used as only a return value, as otherwise
15896 // we can't perform a tail-call. In particular, we need to check for
15897 // target ISD nodes that are returns and any other "odd" constructs
15898 // that the generic analysis code won't necessarily catch.
15899 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
15900                                                SDValue &Chain) const {
15901   if (N->getNumValues() != 1)
15902     return false;
15903   if (!N->hasNUsesOfValue(1, 0))
15904     return false;
15905 
15906   SDValue TCChain = Chain;
15907   SDNode *Copy = *N->use_begin();
15908   if (Copy->getOpcode() == ISD::CopyToReg) {
15909     // If the copy has a glue operand, we conservatively assume it isn't safe to
15910     // perform a tail call.
15911     if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
15912         MVT::Glue)
15913       return false;
15914     TCChain = Copy->getOperand(0);
15915   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
15916     return false;
15917 
15918   bool HasRet = false;
15919   for (SDNode *Node : Copy->uses()) {
15920     if (Node->getOpcode() != AArch64ISD::RET_FLAG)
15921       return false;
15922     HasRet = true;
15923   }
15924 
15925   if (!HasRet)
15926     return false;
15927 
15928   Chain = TCChain;
15929   return true;
15930 }
15931 
15932 // Return whether the an instruction can potentially be optimized to a tail
15933 // call. This will cause the optimizers to attempt to move, or duplicate,
15934 // return instructions to help enable tail call optimizations for this
15935 // instruction.
15936 bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
15937   return CI->isTailCall();
15938 }
15939 
15940 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
15941                                                    SDValue &Offset,
15942                                                    ISD::MemIndexedMode &AM,
15943                                                    bool &IsInc,
15944                                                    SelectionDAG &DAG) const {
15945   if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
15946     return false;
15947 
15948   Base = Op->getOperand(0);
15949   // All of the indexed addressing mode instructions take a signed
15950   // 9 bit immediate offset.
15951   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
15952     int64_t RHSC = RHS->getSExtValue();
15953     if (Op->getOpcode() == ISD::SUB)
15954       RHSC = -(uint64_t)RHSC;
15955     if (!isInt<9>(RHSC))
15956       return false;
15957     IsInc = (Op->getOpcode() == ISD::ADD);
15958     Offset = Op->getOperand(1);
15959     return true;
15960   }
15961   return false;
15962 }
15963 
15964 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
15965                                                       SDValue &Offset,
15966                                                       ISD::MemIndexedMode &AM,
15967                                                       SelectionDAG &DAG) const {
15968   EVT VT;
15969   SDValue Ptr;
15970   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
15971     VT = LD->getMemoryVT();
15972     Ptr = LD->getBasePtr();
15973   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
15974     VT = ST->getMemoryVT();
15975     Ptr = ST->getBasePtr();
15976   } else
15977     return false;
15978 
15979   bool IsInc;
15980   if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
15981     return false;
15982   AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
15983   return true;
15984 }
15985 
15986 bool AArch64TargetLowering::getPostIndexedAddressParts(
15987     SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
15988     ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
15989   EVT VT;
15990   SDValue Ptr;
15991   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
15992     VT = LD->getMemoryVT();
15993     Ptr = LD->getBasePtr();
15994   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
15995     VT = ST->getMemoryVT();
15996     Ptr = ST->getBasePtr();
15997   } else
15998     return false;
15999 
16000   bool IsInc;
16001   if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
16002     return false;
16003   // Post-indexing updates the base, so it's not a valid transform
16004   // if that's not the same as the load's pointer.
16005   if (Ptr != Base)
16006     return false;
16007   AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
16008   return true;
16009 }
16010 
16011 static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
16012                                   SelectionDAG &DAG) {
16013   SDLoc DL(N);
16014   SDValue Op = N->getOperand(0);
16015 
16016   if (N->getValueType(0) != MVT::i16 ||
16017       (Op.getValueType() != MVT::f16 && Op.getValueType() != MVT::bf16))
16018     return;
16019 
16020   Op = SDValue(
16021       DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
16022                          DAG.getUNDEF(MVT::i32), Op,
16023                          DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
16024       0);
16025   Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
16026   Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
16027 }
16028 
16029 static void ReplaceReductionResults(SDNode *N,
16030                                     SmallVectorImpl<SDValue> &Results,
16031                                     SelectionDAG &DAG, unsigned InterOp,
16032                                     unsigned AcrossOp) {
16033   EVT LoVT, HiVT;
16034   SDValue Lo, Hi;
16035   SDLoc dl(N);
16036   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
16037   std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
16038   SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
16039   SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
16040   Results.push_back(SplitVal);
16041 }
16042 
16043 static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
16044   SDLoc DL(N);
16045   SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
16046   SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
16047                            DAG.getNode(ISD::SRL, DL, MVT::i128, N,
16048                                        DAG.getConstant(64, DL, MVT::i64)));
16049   return std::make_pair(Lo, Hi);
16050 }
16051 
16052 void AArch64TargetLowering::ReplaceExtractSubVectorResults(
16053     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
16054   SDValue In = N->getOperand(0);
16055   EVT InVT = In.getValueType();
16056 
16057   // Common code will handle these just fine.
16058   if (!InVT.isScalableVector() || !InVT.isInteger())
16059     return;
16060 
16061   SDLoc DL(N);
16062   EVT VT = N->getValueType(0);
16063 
16064   // The following checks bail if this is not a halving operation.
16065 
16066   ElementCount ResEC = VT.getVectorElementCount();
16067 
16068   if (InVT.getVectorElementCount() != (ResEC * 2))
16069     return;
16070 
16071   auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
16072   if (!CIndex)
16073     return;
16074 
16075   unsigned Index = CIndex->getZExtValue();
16076   if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
16077     return;
16078 
16079   unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
16080   EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
16081 
16082   SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
16083   Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
16084 }
16085 
16086 // Create an even/odd pair of X registers holding integer value V.
16087 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
16088   SDLoc dl(V.getNode());
16089   SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
16090   SDValue VHi = DAG.getAnyExtOrTrunc(
16091       DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
16092       dl, MVT::i64);
16093   if (DAG.getDataLayout().isBigEndian())
16094     std::swap (VLo, VHi);
16095   SDValue RegClass =
16096       DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
16097   SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
16098   SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
16099   const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
16100   return SDValue(
16101       DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
16102 }
16103 
16104 static void ReplaceCMP_SWAP_128Results(SDNode *N,
16105                                        SmallVectorImpl<SDValue> &Results,
16106                                        SelectionDAG &DAG,
16107                                        const AArch64Subtarget *Subtarget) {
16108   assert(N->getValueType(0) == MVT::i128 &&
16109          "AtomicCmpSwap on types less than 128 should be legal");
16110 
16111   if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
16112     // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
16113     // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
16114     SDValue Ops[] = {
16115         createGPRPairNode(DAG, N->getOperand(2)), // Compare value
16116         createGPRPairNode(DAG, N->getOperand(3)), // Store value
16117         N->getOperand(1), // Ptr
16118         N->getOperand(0), // Chain in
16119     };
16120 
16121     MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
16122 
16123     unsigned Opcode;
16124     switch (MemOp->getOrdering()) {
16125     case AtomicOrdering::Monotonic:
16126       Opcode = AArch64::CASPX;
16127       break;
16128     case AtomicOrdering::Acquire:
16129       Opcode = AArch64::CASPAX;
16130       break;
16131     case AtomicOrdering::Release:
16132       Opcode = AArch64::CASPLX;
16133       break;
16134     case AtomicOrdering::AcquireRelease:
16135     case AtomicOrdering::SequentiallyConsistent:
16136       Opcode = AArch64::CASPALX;
16137       break;
16138     default:
16139       llvm_unreachable("Unexpected ordering!");
16140     }
16141 
16142     MachineSDNode *CmpSwap = DAG.getMachineNode(
16143         Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
16144     DAG.setNodeMemRefs(CmpSwap, {MemOp});
16145 
16146     unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
16147     if (DAG.getDataLayout().isBigEndian())
16148       std::swap(SubReg1, SubReg2);
16149     SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
16150                                             SDValue(CmpSwap, 0));
16151     SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
16152                                             SDValue(CmpSwap, 0));
16153     Results.push_back(
16154         DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
16155     Results.push_back(SDValue(CmpSwap, 1)); // Chain out
16156     return;
16157   }
16158 
16159   auto Desired = splitInt128(N->getOperand(2), DAG);
16160   auto New = splitInt128(N->getOperand(3), DAG);
16161   SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
16162                    New.first,        New.second,    N->getOperand(0)};
16163   SDNode *CmpSwap = DAG.getMachineNode(
16164       AArch64::CMP_SWAP_128, SDLoc(N),
16165       DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);
16166 
16167   MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
16168   DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
16169 
16170   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
16171                                 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
16172   Results.push_back(SDValue(CmpSwap, 3));
16173 }
16174 
16175 void AArch64TargetLowering::ReplaceNodeResults(
16176     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
16177   switch (N->getOpcode()) {
16178   default:
16179     llvm_unreachable("Don't know how to custom expand this");
16180   case ISD::BITCAST:
16181     ReplaceBITCASTResults(N, Results, DAG);
16182     return;
16183   case ISD::VECREDUCE_ADD:
16184   case ISD::VECREDUCE_SMAX:
16185   case ISD::VECREDUCE_SMIN:
16186   case ISD::VECREDUCE_UMAX:
16187   case ISD::VECREDUCE_UMIN:
16188     Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
16189     return;
16190 
16191   case ISD::CTPOP:
16192     if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG))
16193       Results.push_back(Result);
16194     return;
16195   case AArch64ISD::SADDV:
16196     ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
16197     return;
16198   case AArch64ISD::UADDV:
16199     ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
16200     return;
16201   case AArch64ISD::SMINV:
16202     ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
16203     return;
16204   case AArch64ISD::UMINV:
16205     ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
16206     return;
16207   case AArch64ISD::SMAXV:
16208     ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
16209     return;
16210   case AArch64ISD::UMAXV:
16211     ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
16212     return;
16213   case ISD::FP_TO_UINT:
16214   case ISD::FP_TO_SINT:
16215     assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
16216     // Let normal code take care of it by not adding anything to Results.
16217     return;
16218   case ISD::ATOMIC_CMP_SWAP:
16219     ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
16220     return;
16221   case ISD::LOAD: {
16222     assert(SDValue(N, 0).getValueType() == MVT::i128 &&
16223            "unexpected load's value type");
16224     LoadSDNode *LoadNode = cast<LoadSDNode>(N);
16225     if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
16226       // Non-volatile loads are optimized later in AArch64's load/store
16227       // optimizer.
16228       return;
16229     }
16230 
16231     SDValue Result = DAG.getMemIntrinsicNode(
16232         AArch64ISD::LDP, SDLoc(N),
16233         DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
16234         {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
16235         LoadNode->getMemOperand());
16236 
16237     SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
16238                                Result.getValue(0), Result.getValue(1));
16239     Results.append({Pair, Result.getValue(2) /* Chain */});
16240     return;
16241   }
16242   case ISD::EXTRACT_SUBVECTOR:
16243     ReplaceExtractSubVectorResults(N, Results, DAG);
16244     return;
16245   case ISD::INTRINSIC_WO_CHAIN: {
16246     EVT VT = N->getValueType(0);
16247     assert((VT == MVT::i8 || VT == MVT::i16) &&
16248            "custom lowering for unexpected type");
16249 
16250     ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
16251     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
16252     switch (IntID) {
16253     default:
16254       return;
16255     case Intrinsic::aarch64_sve_clasta_n: {
16256       SDLoc DL(N);
16257       auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
16258       auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
16259                            N->getOperand(1), Op2, N->getOperand(3));
16260       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
16261       return;
16262     }
16263     case Intrinsic::aarch64_sve_clastb_n: {
16264       SDLoc DL(N);
16265       auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
16266       auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
16267                            N->getOperand(1), Op2, N->getOperand(3));
16268       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
16269       return;
16270     }
16271     case Intrinsic::aarch64_sve_lasta: {
16272       SDLoc DL(N);
16273       auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
16274                            N->getOperand(1), N->getOperand(2));
16275       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
16276       return;
16277     }
16278     case Intrinsic::aarch64_sve_lastb: {
16279       SDLoc DL(N);
16280       auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
16281                            N->getOperand(1), N->getOperand(2));
16282       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
16283       return;
16284     }
16285     }
16286   }
16287   }
16288 }
16289 
16290 bool AArch64TargetLowering::useLoadStackGuardNode() const {
16291   if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
16292     return TargetLowering::useLoadStackGuardNode();
16293   return true;
16294 }
16295 
16296 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
16297   // Combine multiple FDIVs with the same divisor into multiple FMULs by the
16298   // reciprocal if there are three or more FDIVs.
16299   return 3;
16300 }
16301 
16302 TargetLoweringBase::LegalizeTypeAction
16303 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
16304   // During type legalization, we prefer to widen v1i8, v1i16, v1i32  to v8i8,
16305   // v4i16, v2i32 instead of to promote.
16306   if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
16307       VT == MVT::v1f32)
16308     return TypeWidenVector;
16309 
16310   return TargetLoweringBase::getPreferredVectorAction(VT);
16311 }
16312 
16313 // Loads and stores less than 128-bits are already atomic; ones above that
16314 // are doomed anyway, so defer to the default libcall and blame the OS when
16315 // things go wrong.
16316 bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
16317   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
16318   return Size == 128;
16319 }
16320 
16321 // Loads and stores less than 128-bits are already atomic; ones above that
16322 // are doomed anyway, so defer to the default libcall and blame the OS when
16323 // things go wrong.
16324 TargetLowering::AtomicExpansionKind
16325 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
16326   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
16327   return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
16328 }
16329 
16330 // For the real atomic operations, we have ldxr/stxr up to 128 bits,
16331 TargetLowering::AtomicExpansionKind
16332 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
16333   if (AI->isFloatingPointOperation())
16334     return AtomicExpansionKind::CmpXChg;
16335 
16336   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
16337   if (Size > 128) return AtomicExpansionKind::None;
16338 
16339   // Nand is not supported in LSE.
16340   // Leave 128 bits to LLSC or CmpXChg.
16341   if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
16342     if (Subtarget->hasLSE())
16343       return AtomicExpansionKind::None;
16344     if (Subtarget->outlineAtomics()) {
16345       // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
16346       // Don't outline them unless
16347       // (1) high level <atomic> support approved:
16348       //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
16349       // (2) low level libgcc and compiler-rt support implemented by:
16350       //   min/max outline atomics helpers
16351       if (AI->getOperation() != AtomicRMWInst::Min &&
16352           AI->getOperation() != AtomicRMWInst::Max &&
16353           AI->getOperation() != AtomicRMWInst::UMin &&
16354           AI->getOperation() != AtomicRMWInst::UMax) {
16355         return AtomicExpansionKind::None;
16356       }
16357     }
16358   }
16359 
16360   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
16361   // implement atomicrmw without spilling. If the target address is also on the
16362   // stack and close enough to the spill slot, this can lead to a situation
16363   // where the monitor always gets cleared and the atomic operation can never
16364   // succeed. So at -O0 lower this operation to a CAS loop.
16365   if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
16366     return AtomicExpansionKind::CmpXChg;
16367 
16368   return AtomicExpansionKind::LLSC;
16369 }
16370 
16371 TargetLowering::AtomicExpansionKind
16372 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
16373     AtomicCmpXchgInst *AI) const {
16374   // If subtarget has LSE, leave cmpxchg intact for codegen.
16375   if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
16376     return AtomicExpansionKind::None;
16377   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
16378   // implement cmpxchg without spilling. If the address being exchanged is also
16379   // on the stack and close enough to the spill slot, this can lead to a
16380   // situation where the monitor always gets cleared and the atomic operation
16381   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
16382   if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
16383     return AtomicExpansionKind::None;
16384   return AtomicExpansionKind::LLSC;
16385 }
16386 
16387 Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
16388                                              AtomicOrdering Ord) const {
16389   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
16390   Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
16391   bool IsAcquire = isAcquireOrStronger(Ord);
16392 
16393   // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
16394   // intrinsic must return {i64, i64} and we have to recombine them into a
16395   // single i128 here.
16396   if (ValTy->getPrimitiveSizeInBits() == 128) {
16397     Intrinsic::ID Int =
16398         IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
16399     Function *Ldxr = Intrinsic::getDeclaration(M, Int);
16400 
16401     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
16402     Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
16403 
16404     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
16405     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
16406     Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
16407     Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
16408     return Builder.CreateOr(
16409         Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
16410   }
16411 
16412   Type *Tys[] = { Addr->getType() };
16413   Intrinsic::ID Int =
16414       IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
16415   Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
16416 
16417   Type *EltTy = cast<PointerType>(Addr->getType())->getElementType();
16418 
16419   const DataLayout &DL = M->getDataLayout();
16420   IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(EltTy));
16421   Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);
16422 
16423   return Builder.CreateBitCast(Trunc, EltTy);
16424 }
16425 
16426 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
16427     IRBuilder<> &Builder) const {
16428   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
16429   Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
16430 }
16431 
16432 Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
16433                                                    Value *Val, Value *Addr,
16434                                                    AtomicOrdering Ord) const {
16435   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
16436   bool IsRelease = isReleaseOrStronger(Ord);
16437 
16438   // Since the intrinsics must have legal type, the i128 intrinsics take two
16439   // parameters: "i64, i64". We must marshal Val into the appropriate form
16440   // before the call.
16441   if (Val->getType()->getPrimitiveSizeInBits() == 128) {
16442     Intrinsic::ID Int =
16443         IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
16444     Function *Stxr = Intrinsic::getDeclaration(M, Int);
16445     Type *Int64Ty = Type::getInt64Ty(M->getContext());
16446 
16447     Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
16448     Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
16449     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
16450     return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
16451   }
16452 
16453   Intrinsic::ID Int =
16454       IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
16455   Type *Tys[] = { Addr->getType() };
16456   Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
16457 
16458   const DataLayout &DL = M->getDataLayout();
16459   IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
16460   Val = Builder.CreateBitCast(Val, IntValTy);
16461 
16462   return Builder.CreateCall(Stxr,
16463                             {Builder.CreateZExtOrBitCast(
16464                                  Val, Stxr->getFunctionType()->getParamType(0)),
16465                              Addr});
16466 }
16467 
16468 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
16469     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
16470   if (Ty->isArrayTy())
16471     return true;
16472 
16473   const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
16474   if (TySize.isScalable() && TySize.getKnownMinSize() > 128)
16475     return true;
16476 
16477   return false;
16478 }
16479 
16480 bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
16481                                                             EVT) const {
16482   return false;
16483 }
16484 
16485 static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
16486   Module *M = IRB.GetInsertBlock()->getParent()->getParent();
16487   Function *ThreadPointerFunc =
16488       Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
16489   return IRB.CreatePointerCast(
16490       IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
16491                              Offset),
16492       IRB.getInt8PtrTy()->getPointerTo(0));
16493 }
16494 
16495 Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
16496   // Android provides a fixed TLS slot for the stack cookie. See the definition
16497   // of TLS_SLOT_STACK_GUARD in
16498   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
16499   if (Subtarget->isTargetAndroid())
16500     return UseTlsOffset(IRB, 0x28);
16501 
16502   // Fuchsia is similar.
16503   // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
16504   if (Subtarget->isTargetFuchsia())
16505     return UseTlsOffset(IRB, -0x10);
16506 
16507   return TargetLowering::getIRStackGuard(IRB);
16508 }
16509 
16510 void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
16511   // MSVC CRT provides functionalities for stack protection.
16512   if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
16513     // MSVC CRT has a global variable holding security cookie.
16514     M.getOrInsertGlobal("__security_cookie",
16515                         Type::getInt8PtrTy(M.getContext()));
16516 
16517     // MSVC CRT has a function to validate security cookie.
16518     FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
16519         "__security_check_cookie", Type::getVoidTy(M.getContext()),
16520         Type::getInt8PtrTy(M.getContext()));
16521     if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
16522       F->setCallingConv(CallingConv::Win64);
16523       F->addAttribute(1, Attribute::AttrKind::InReg);
16524     }
16525     return;
16526   }
16527   TargetLowering::insertSSPDeclarations(M);
16528 }
16529 
16530 Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
16531   // MSVC CRT has a global variable holding security cookie.
16532   if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
16533     return M.getGlobalVariable("__security_cookie");
16534   return TargetLowering::getSDagStackGuard(M);
16535 }
16536 
16537 Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
16538   // MSVC CRT has a function to validate security cookie.
16539   if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
16540     return M.getFunction("__security_check_cookie");
16541   return TargetLowering::getSSPStackGuardCheck(M);
16542 }
16543 
16544 Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
16545   // Android provides a fixed TLS slot for the SafeStack pointer. See the
16546   // definition of TLS_SLOT_SAFESTACK in
16547   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
16548   if (Subtarget->isTargetAndroid())
16549     return UseTlsOffset(IRB, 0x48);
16550 
16551   // Fuchsia is similar.
16552   // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
16553   if (Subtarget->isTargetFuchsia())
16554     return UseTlsOffset(IRB, -0x8);
16555 
16556   return TargetLowering::getSafeStackPointerLocation(IRB);
16557 }
16558 
16559 bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
16560     const Instruction &AndI) const {
16561   // Only sink 'and' mask to cmp use block if it is masking a single bit, since
16562   // this is likely to be fold the and/cmp/br into a single tbz instruction.  It
16563   // may be beneficial to sink in other cases, but we would have to check that
16564   // the cmp would not get folded into the br to form a cbz for these to be
16565   // beneficial.
16566   ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
16567   if (!Mask)
16568     return false;
16569   return Mask->getValue().isPowerOf2();
16570 }
16571 
16572 bool AArch64TargetLowering::
16573     shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
16574         SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
16575         unsigned OldShiftOpcode, unsigned NewShiftOpcode,
16576         SelectionDAG &DAG) const {
16577   // Does baseline recommend not to perform the fold by default?
16578   if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
16579           X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
16580     return false;
16581   // Else, if this is a vector shift, prefer 'shl'.
16582   return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
16583 }
16584 
16585 bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
16586                                               SDNode *N) const {
16587   if (DAG.getMachineFunction().getFunction().hasMinSize() &&
16588       !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
16589     return false;
16590   return true;
16591 }
16592 
16593 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
16594   // Update IsSplitCSR in AArch64unctionInfo.
16595   AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
16596   AFI->setIsSplitCSR(true);
16597 }
16598 
16599 void AArch64TargetLowering::insertCopiesSplitCSR(
16600     MachineBasicBlock *Entry,
16601     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
16602   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16603   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
16604   if (!IStart)
16605     return;
16606 
16607   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
16608   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
16609   MachineBasicBlock::iterator MBBI = Entry->begin();
16610   for (const MCPhysReg *I = IStart; *I; ++I) {
16611     const TargetRegisterClass *RC = nullptr;
16612     if (AArch64::GPR64RegClass.contains(*I))
16613       RC = &AArch64::GPR64RegClass;
16614     else if (AArch64::FPR64RegClass.contains(*I))
16615       RC = &AArch64::FPR64RegClass;
16616     else
16617       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
16618 
16619     Register NewVR = MRI->createVirtualRegister(RC);
16620     // Create copy from CSR to a virtual register.
16621     // FIXME: this currently does not emit CFI pseudo-instructions, it works
16622     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
16623     // nounwind. If we want to generalize this later, we may need to emit
16624     // CFI pseudo-instructions.
16625     assert(Entry->getParent()->getFunction().hasFnAttribute(
16626                Attribute::NoUnwind) &&
16627            "Function should be nounwind in insertCopiesSplitCSR!");
16628     Entry->addLiveIn(*I);
16629     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
16630         .addReg(*I);
16631 
16632     // Insert the copy-back instructions right before the terminator.
16633     for (auto *Exit : Exits)
16634       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
16635               TII->get(TargetOpcode::COPY), *I)
16636           .addReg(NewVR);
16637   }
16638 }
16639 
16640 bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
16641   // Integer division on AArch64 is expensive. However, when aggressively
16642   // optimizing for code size, we prefer to use a div instruction, as it is
16643   // usually smaller than the alternative sequence.
16644   // The exception to this is vector division. Since AArch64 doesn't have vector
16645   // integer division, leaving the division as-is is a loss even in terms of
16646   // size, because it will have to be scalarized, while the alternative code
16647   // sequence can be performed in vector form.
16648   bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
16649   return OptSize && !VT.isVector();
16650 }
16651 
16652 bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
16653   // We want inc-of-add for scalars and sub-of-not for vectors.
16654   return VT.isScalarInteger();
16655 }
16656 
16657 bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
16658   return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
16659 }
16660 
16661 unsigned
16662 AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
16663   if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
16664     return getPointerTy(DL).getSizeInBits();
16665 
16666   return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
16667 }
16668 
16669 void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
16670   MF.getFrameInfo().computeMaxCallFrameSize(MF);
16671   TargetLoweringBase::finalizeLowering(MF);
16672 }
16673 
16674 // Unlike X86, we let frame lowering assign offsets to all catch objects.
16675 bool AArch64TargetLowering::needsFixedCatchObjects() const {
16676   return false;
16677 }
16678 
16679 bool AArch64TargetLowering::shouldLocalize(
16680     const MachineInstr &MI, const TargetTransformInfo *TTI) const {
16681   switch (MI.getOpcode()) {
16682   case TargetOpcode::G_GLOBAL_VALUE: {
16683     // On Darwin, TLS global vars get selected into function calls, which
16684     // we don't want localized, as they can get moved into the middle of a
16685     // another call sequence.
16686     const GlobalValue &GV = *MI.getOperand(1).getGlobal();
16687     if (GV.isThreadLocal() && Subtarget->isTargetMachO())
16688       return false;
16689     break;
16690   }
16691   // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
16692   // localizable.
16693   case AArch64::ADRP:
16694   case AArch64::G_ADD_LOW:
16695     return true;
16696   default:
16697     break;
16698   }
16699   return TargetLoweringBase::shouldLocalize(MI, TTI);
16700 }
16701 
16702 bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
16703   if (isa<ScalableVectorType>(Inst.getType()))
16704     return true;
16705 
16706   for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
16707     if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
16708       return true;
16709 
16710   if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
16711     if (isa<ScalableVectorType>(AI->getAllocatedType()))
16712       return true;
16713   }
16714 
16715   return false;
16716 }
16717 
16718 // Return the largest legal scalable vector type that matches VT's element type.
16719 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
16720   assert(VT.isFixedLengthVector() &&
16721          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
16722          "Expected legal fixed length vector!");
16723   switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
16724   default:
16725     llvm_unreachable("unexpected element type for SVE container");
16726   case MVT::i8:
16727     return EVT(MVT::nxv16i8);
16728   case MVT::i16:
16729     return EVT(MVT::nxv8i16);
16730   case MVT::i32:
16731     return EVT(MVT::nxv4i32);
16732   case MVT::i64:
16733     return EVT(MVT::nxv2i64);
16734   case MVT::f16:
16735     return EVT(MVT::nxv8f16);
16736   case MVT::f32:
16737     return EVT(MVT::nxv4f32);
16738   case MVT::f64:
16739     return EVT(MVT::nxv2f64);
16740   }
16741 }
16742 
16743 // Return a PTRUE with active lanes corresponding to the extent of VT.
16744 static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
16745                                                 EVT VT) {
16746   assert(VT.isFixedLengthVector() &&
16747          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
16748          "Expected legal fixed length vector!");
16749 
16750   int PgPattern;
16751   switch (VT.getVectorNumElements()) {
16752   default:
16753     llvm_unreachable("unexpected element count for SVE predicate");
16754   case 1:
16755     PgPattern = AArch64SVEPredPattern::vl1;
16756     break;
16757   case 2:
16758     PgPattern = AArch64SVEPredPattern::vl2;
16759     break;
16760   case 4:
16761     PgPattern = AArch64SVEPredPattern::vl4;
16762     break;
16763   case 8:
16764     PgPattern = AArch64SVEPredPattern::vl8;
16765     break;
16766   case 16:
16767     PgPattern = AArch64SVEPredPattern::vl16;
16768     break;
16769   case 32:
16770     PgPattern = AArch64SVEPredPattern::vl32;
16771     break;
16772   case 64:
16773     PgPattern = AArch64SVEPredPattern::vl64;
16774     break;
16775   case 128:
16776     PgPattern = AArch64SVEPredPattern::vl128;
16777     break;
16778   case 256:
16779     PgPattern = AArch64SVEPredPattern::vl256;
16780     break;
16781   }
16782 
16783   // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
16784   // use AArch64SVEPredPattern::all, which can enable the use of unpredicated
16785   // variants of instructions when available.
16786 
16787   MVT MaskVT;
16788   switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
16789   default:
16790     llvm_unreachable("unexpected element type for SVE predicate");
16791   case MVT::i8:
16792     MaskVT = MVT::nxv16i1;
16793     break;
16794   case MVT::i16:
16795   case MVT::f16:
16796     MaskVT = MVT::nxv8i1;
16797     break;
16798   case MVT::i32:
16799   case MVT::f32:
16800     MaskVT = MVT::nxv4i1;
16801     break;
16802   case MVT::i64:
16803   case MVT::f64:
16804     MaskVT = MVT::nxv2i1;
16805     break;
16806   }
16807 
16808   return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
16809                      DAG.getTargetConstant(PgPattern, DL, MVT::i64));
16810 }
16811 
16812 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
16813                                              EVT VT) {
16814   assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
16815          "Expected legal scalable vector!");
16816   auto PredTy = VT.changeVectorElementType(MVT::i1);
16817   return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
16818 }
16819 
16820 static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
16821   if (VT.isFixedLengthVector())
16822     return getPredicateForFixedLengthVector(DAG, DL, VT);
16823 
16824   return getPredicateForScalableVector(DAG, DL, VT);
16825 }
16826 
16827 // Grow V to consume an entire SVE register.
16828 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
16829   assert(VT.isScalableVector() &&
16830          "Expected to convert into a scalable vector!");
16831   assert(V.getValueType().isFixedLengthVector() &&
16832          "Expected a fixed length vector operand!");
16833   SDLoc DL(V);
16834   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
16835   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
16836 }
16837 
16838 // Shrink V so it's just big enough to maintain a VT's worth of data.
16839 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
16840   assert(VT.isFixedLengthVector() &&
16841          "Expected to convert into a fixed length vector!");
16842   assert(V.getValueType().isScalableVector() &&
16843          "Expected a scalable vector operand!");
16844   SDLoc DL(V);
16845   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
16846   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
16847 }
16848 
16849 // Convert all fixed length vector loads larger than NEON to masked_loads.
16850 SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
16851     SDValue Op, SelectionDAG &DAG) const {
16852   auto Load = cast<LoadSDNode>(Op);
16853 
16854   SDLoc DL(Op);
16855   EVT VT = Op.getValueType();
16856   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
16857 
16858   auto NewLoad = DAG.getMaskedLoad(
16859       ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
16860       getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
16861       Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
16862       Load->getExtensionType());
16863 
16864   auto Result = convertFromScalableVector(DAG, VT, NewLoad);
16865   SDValue MergedValues[2] = {Result, Load->getChain()};
16866   return DAG.getMergeValues(MergedValues, DL);
16867 }
16868 
16869 // Convert all fixed length vector stores larger than NEON to masked_stores.
16870 SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
16871     SDValue Op, SelectionDAG &DAG) const {
16872   auto Store = cast<StoreSDNode>(Op);
16873 
16874   SDLoc DL(Op);
16875   EVT VT = Store->getValue().getValueType();
16876   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
16877 
16878   auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
16879   return DAG.getMaskedStore(
16880       Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
16881       getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
16882       Store->getMemOperand(), Store->getAddressingMode(),
16883       Store->isTruncatingStore());
16884 }
16885 
16886 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
16887     SDValue Op, SelectionDAG &DAG) const {
16888   SDLoc dl(Op);
16889   EVT VT = Op.getValueType();
16890   EVT EltVT = VT.getVectorElementType();
16891 
16892   bool Signed = Op.getOpcode() == ISD::SDIV;
16893   unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
16894 
16895   // Scalable vector i32/i64 DIV is supported.
16896   if (EltVT == MVT::i32 || EltVT == MVT::i64)
16897     return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true);
16898 
16899   // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
16900   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
16901   EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16902   EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
16903   EVT ScalableWidenedVT = getContainerForFixedLengthVector(DAG, FixedWidenedVT);
16904 
16905   // Convert the operands to scalable vectors.
16906   SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
16907   SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
16908 
16909   // Extend the scalable operands.
16910   unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
16911   unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
16912   SDValue Op0Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op0);
16913   SDValue Op1Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op1);
16914   SDValue Op0Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op0);
16915   SDValue Op1Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op1);
16916 
16917   // Convert back to fixed vectors so the DIV can be further lowered.
16918   Op0Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op0Lo);
16919   Op1Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op1Lo);
16920   Op0Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op0Hi);
16921   Op1Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op1Hi);
16922   SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
16923                                  Op0Lo, Op1Lo);
16924   SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
16925                                  Op0Hi, Op1Hi);
16926 
16927   // Convert again to scalable vectors to truncate.
16928   ResultLo = convertToScalableVector(DAG, ScalableWidenedVT, ResultLo);
16929   ResultHi = convertToScalableVector(DAG, ScalableWidenedVT, ResultHi);
16930   SDValue ScalableResult = DAG.getNode(AArch64ISD::UZP1, dl, ContainerVT,
16931                                        ResultLo, ResultHi);
16932 
16933   return convertFromScalableVector(DAG, VT, ScalableResult);
16934 }
16935 
16936 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
16937     SDValue Op, SelectionDAG &DAG) const {
16938   EVT VT = Op.getValueType();
16939   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
16940 
16941   SDLoc DL(Op);
16942   SDValue Val = Op.getOperand(0);
16943   EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
16944   Val = convertToScalableVector(DAG, ContainerVT, Val);
16945 
16946   bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
16947   unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
16948 
16949   // Repeatedly unpack Val until the result is of the desired element type.
16950   switch (ContainerVT.getSimpleVT().SimpleTy) {
16951   default:
16952     llvm_unreachable("unimplemented container type");
16953   case MVT::nxv16i8:
16954     Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
16955     if (VT.getVectorElementType() == MVT::i16)
16956       break;
16957     LLVM_FALLTHROUGH;
16958   case MVT::nxv8i16:
16959     Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
16960     if (VT.getVectorElementType() == MVT::i32)
16961       break;
16962     LLVM_FALLTHROUGH;
16963   case MVT::nxv4i32:
16964     Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
16965     assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
16966     break;
16967   }
16968 
16969   return convertFromScalableVector(DAG, VT, Val);
16970 }
16971 
16972 SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
16973     SDValue Op, SelectionDAG &DAG) const {
16974   EVT VT = Op.getValueType();
16975   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
16976 
16977   SDLoc DL(Op);
16978   SDValue Val = Op.getOperand(0);
16979   EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
16980   Val = convertToScalableVector(DAG, ContainerVT, Val);
16981 
16982   // Repeatedly truncate Val until the result is of the desired element type.
16983   switch (ContainerVT.getSimpleVT().SimpleTy) {
16984   default:
16985     llvm_unreachable("unimplemented container type");
16986   case MVT::nxv2i64:
16987     Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
16988     Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
16989     if (VT.getVectorElementType() == MVT::i32)
16990       break;
16991     LLVM_FALLTHROUGH;
16992   case MVT::nxv4i32:
16993     Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
16994     Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
16995     if (VT.getVectorElementType() == MVT::i16)
16996       break;
16997     LLVM_FALLTHROUGH;
16998   case MVT::nxv8i16:
16999     Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
17000     Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
17001     assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
17002     break;
17003   }
17004 
17005   return convertFromScalableVector(DAG, VT, Val);
17006 }
17007 
17008 // Convert vector operation 'Op' to an equivalent predicated operation whereby
17009 // the original operation's type is used to construct a suitable predicate.
17010 // NOTE: The results for inactive lanes are undefined.
17011 SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
17012                                                    SelectionDAG &DAG,
17013                                                    unsigned NewOp,
17014                                                    bool OverrideNEON) const {
17015   EVT VT = Op.getValueType();
17016   SDLoc DL(Op);
17017   auto Pg = getPredicateForVector(DAG, DL, VT);
17018 
17019   if (useSVEForFixedLengthVectorVT(VT, OverrideNEON)) {
17020     EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
17021 
17022     // Create list of operands by converting existing ones to scalable types.
17023     SmallVector<SDValue, 4> Operands = {Pg};
17024     for (const SDValue &V : Op->op_values()) {
17025       if (isa<CondCodeSDNode>(V)) {
17026         Operands.push_back(V);
17027         continue;
17028       }
17029 
17030       if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
17031         EVT VTArg = VTNode->getVT().getVectorElementType();
17032         EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
17033         Operands.push_back(DAG.getValueType(NewVTArg));
17034         continue;
17035       }
17036 
17037       assert(useSVEForFixedLengthVectorVT(V.getValueType(), OverrideNEON) &&
17038              "Only fixed length vectors are supported!");
17039       Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
17040     }
17041 
17042     if (isMergePassthruOpcode(NewOp))
17043       Operands.push_back(DAG.getUNDEF(ContainerVT));
17044 
17045     auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
17046     return convertFromScalableVector(DAG, VT, ScalableRes);
17047   }
17048 
17049   assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
17050 
17051   SmallVector<SDValue, 4> Operands = {Pg};
17052   for (const SDValue &V : Op->op_values()) {
17053     assert((!V.getValueType().isVector() ||
17054             V.getValueType().isScalableVector()) &&
17055            "Only scalable vectors are supported!");
17056     Operands.push_back(V);
17057   }
17058 
17059   if (isMergePassthruOpcode(NewOp))
17060     Operands.push_back(DAG.getUNDEF(VT));
17061 
17062   return DAG.getNode(NewOp, DL, VT, Operands);
17063 }
17064 
17065 // If a fixed length vector operation has no side effects when applied to
17066 // undefined elements, we can safely use scalable vectors to perform the same
17067 // operation without needing to worry about predication.
17068 SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
17069                                                  SelectionDAG &DAG) const {
17070   EVT VT = Op.getValueType();
17071   assert(useSVEForFixedLengthVectorVT(VT) &&
17072          "Only expected to lower fixed length vector operation!");
17073   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
17074 
17075   // Create list of operands by converting existing ones to scalable types.
17076   SmallVector<SDValue, 4> Ops;
17077   for (const SDValue &V : Op->op_values()) {
17078     assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
17079 
17080     // Pass through non-vector operands.
17081     if (!V.getValueType().isVector()) {
17082       Ops.push_back(V);
17083       continue;
17084     }
17085 
17086     // "cast" fixed length vector to a scalable vector.
17087     assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
17088            "Only fixed length vectors are supported!");
17089     Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
17090   }
17091 
17092   auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
17093   return convertFromScalableVector(DAG, VT, ScalableRes);
17094 }
17095 
17096 SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
17097     SelectionDAG &DAG) const {
17098   SDLoc DL(ScalarOp);
17099   SDValue AccOp = ScalarOp.getOperand(0);
17100   SDValue VecOp = ScalarOp.getOperand(1);
17101   EVT SrcVT = VecOp.getValueType();
17102   EVT ResVT = SrcVT.getVectorElementType();
17103 
17104   EVT ContainerVT = SrcVT;
17105   if (SrcVT.isFixedLengthVector()) {
17106     ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
17107     VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
17108   }
17109 
17110   SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
17111   SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17112 
17113   // Convert operands to Scalable.
17114   AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
17115                       DAG.getUNDEF(ContainerVT), AccOp, Zero);
17116 
17117   // Perform reduction.
17118   SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
17119                             Pg, AccOp, VecOp);
17120 
17121   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
17122 }
17123 
17124 SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
17125                                                        SelectionDAG &DAG) const {
17126   SDLoc DL(ReduceOp);
17127   SDValue Op = ReduceOp.getOperand(0);
17128   EVT OpVT = Op.getValueType();
17129   EVT VT = ReduceOp.getValueType();
17130 
17131   if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
17132     return SDValue();
17133 
17134   SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
17135 
17136   switch (ReduceOp.getOpcode()) {
17137   default:
17138     return SDValue();
17139   case ISD::VECREDUCE_OR:
17140     return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
17141   case ISD::VECREDUCE_AND: {
17142     Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
17143     return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
17144   }
17145   case ISD::VECREDUCE_XOR: {
17146     SDValue ID =
17147         DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
17148     SDValue Cntp =
17149         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
17150     return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
17151   }
17152   }
17153 
17154   return SDValue();
17155 }
17156 
17157 SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
17158                                                    SDValue ScalarOp,
17159                                                    SelectionDAG &DAG) const {
17160   SDLoc DL(ScalarOp);
17161   SDValue VecOp = ScalarOp.getOperand(0);
17162   EVT SrcVT = VecOp.getValueType();
17163 
17164   if (useSVEForFixedLengthVectorVT(SrcVT, true)) {
17165     EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
17166     VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
17167   }
17168 
17169   // UADDV always returns an i64 result.
17170   EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
17171                                                    SrcVT.getVectorElementType();
17172   EVT RdxVT = SrcVT;
17173   if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
17174     RdxVT = getPackedSVEVectorVT(ResVT);
17175 
17176   SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
17177   SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
17178   SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
17179                             Rdx, DAG.getConstant(0, DL, MVT::i64));
17180 
17181   // The VEC_REDUCE nodes expect an element size result.
17182   if (ResVT != ScalarOp.getValueType())
17183     Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
17184 
17185   return Res;
17186 }
17187 
17188 SDValue
17189 AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
17190     SelectionDAG &DAG) const {
17191   EVT VT = Op.getValueType();
17192   SDLoc DL(Op);
17193 
17194   EVT InVT = Op.getOperand(1).getValueType();
17195   EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
17196   SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
17197   SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
17198 
17199   // Convert the mask to a predicated (NOTE: We don't need to worry about
17200   // inactive lanes since VSELECT is safe when given undefined elements).
17201   EVT MaskVT = Op.getOperand(0).getValueType();
17202   EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
17203   auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
17204   Mask = DAG.getNode(ISD::TRUNCATE, DL,
17205                      MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
17206 
17207   auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
17208                                 Mask, Op1, Op2);
17209 
17210   return convertFromScalableVector(DAG, VT, ScalableRes);
17211 }
17212 
17213 SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
17214     SDValue Op, SelectionDAG &DAG) const {
17215   SDLoc DL(Op);
17216   EVT InVT = Op.getOperand(0).getValueType();
17217   EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
17218 
17219   assert(useSVEForFixedLengthVectorVT(InVT) &&
17220          "Only expected to lower fixed length vector operation!");
17221   assert(Op.getValueType() == InVT.changeTypeToInteger() &&
17222          "Expected integer result of the same bit length as the inputs!");
17223 
17224   // Expand floating point vector comparisons.
17225   if (InVT.isFloatingPoint())
17226     return SDValue();
17227 
17228   auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
17229   auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
17230   auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
17231 
17232   EVT CmpVT = Pg.getValueType();
17233   auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
17234                          {Pg, Op1, Op2, Op.getOperand(2)});
17235 
17236   EVT PromoteVT = ContainerVT.changeTypeToInteger();
17237   auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
17238   return convertFromScalableVector(DAG, Op.getValueType(), Promote);
17239 }
17240 
17241 SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
17242                                                  SelectionDAG &DAG) const {
17243   SDLoc DL(Op);
17244   EVT InVT = Op.getValueType();
17245   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17246   (void)TLI;
17247 
17248   assert(VT.isScalableVector() && TLI.isTypeLegal(VT) &&
17249          InVT.isScalableVector() && TLI.isTypeLegal(InVT) &&
17250          "Only expect to cast between legal scalable vector types!");
17251   assert((VT.getVectorElementType() == MVT::i1) ==
17252              (InVT.getVectorElementType() == MVT::i1) &&
17253          "Cannot cast between data and predicate scalable vector types!");
17254 
17255   if (InVT == VT)
17256     return Op;
17257 
17258   if (VT.getVectorElementType() == MVT::i1)
17259     return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
17260 
17261   EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
17262   EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
17263   assert((VT == PackedVT || InVT == PackedInVT) &&
17264          "Cannot cast between unpacked scalable vector types!");
17265 
17266   // Pack input if required.
17267   if (InVT != PackedInVT)
17268     Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
17269 
17270   Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
17271 
17272   // Unpack result if required.
17273   if (VT != PackedVT)
17274     Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
17275 
17276   return Op;
17277 }
17278