xref: /freebsd/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp (revision a3266ba2697a383d2ede56803320d941866c7e76)
1 //===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that ARM uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "ARMISelLowering.h"
15 #include "ARMBaseInstrInfo.h"
16 #include "ARMBaseRegisterInfo.h"
17 #include "ARMCallingConv.h"
18 #include "ARMConstantPoolValue.h"
19 #include "ARMMachineFunctionInfo.h"
20 #include "ARMPerfectShuffle.h"
21 #include "ARMRegisterInfo.h"
22 #include "ARMSelectionDAGInfo.h"
23 #include "ARMSubtarget.h"
24 #include "MCTargetDesc/ARMAddressingModes.h"
25 #include "MCTargetDesc/ARMBaseInfo.h"
26 #include "Utils/ARMBaseInfo.h"
27 #include "llvm/ADT/APFloat.h"
28 #include "llvm/ADT/APInt.h"
29 #include "llvm/ADT/ArrayRef.h"
30 #include "llvm/ADT/BitVector.h"
31 #include "llvm/ADT/DenseMap.h"
32 #include "llvm/ADT/STLExtras.h"
33 #include "llvm/ADT/SmallPtrSet.h"
34 #include "llvm/ADT/SmallVector.h"
35 #include "llvm/ADT/Statistic.h"
36 #include "llvm/ADT/StringExtras.h"
37 #include "llvm/ADT/StringRef.h"
38 #include "llvm/ADT/StringSwitch.h"
39 #include "llvm/ADT/Triple.h"
40 #include "llvm/ADT/Twine.h"
41 #include "llvm/Analysis/VectorUtils.h"
42 #include "llvm/CodeGen/CallingConvLower.h"
43 #include "llvm/CodeGen/ISDOpcodes.h"
44 #include "llvm/CodeGen/IntrinsicLowering.h"
45 #include "llvm/CodeGen/MachineBasicBlock.h"
46 #include "llvm/CodeGen/MachineConstantPool.h"
47 #include "llvm/CodeGen/MachineFrameInfo.h"
48 #include "llvm/CodeGen/MachineFunction.h"
49 #include "llvm/CodeGen/MachineInstr.h"
50 #include "llvm/CodeGen/MachineInstrBuilder.h"
51 #include "llvm/CodeGen/MachineJumpTableInfo.h"
52 #include "llvm/CodeGen/MachineMemOperand.h"
53 #include "llvm/CodeGen/MachineOperand.h"
54 #include "llvm/CodeGen/MachineRegisterInfo.h"
55 #include "llvm/CodeGen/RuntimeLibcalls.h"
56 #include "llvm/CodeGen/SelectionDAG.h"
57 #include "llvm/CodeGen/SelectionDAGNodes.h"
58 #include "llvm/CodeGen/TargetInstrInfo.h"
59 #include "llvm/CodeGen/TargetLowering.h"
60 #include "llvm/CodeGen/TargetOpcodes.h"
61 #include "llvm/CodeGen/TargetRegisterInfo.h"
62 #include "llvm/CodeGen/TargetSubtargetInfo.h"
63 #include "llvm/CodeGen/ValueTypes.h"
64 #include "llvm/IR/Attributes.h"
65 #include "llvm/IR/CallingConv.h"
66 #include "llvm/IR/Constant.h"
67 #include "llvm/IR/Constants.h"
68 #include "llvm/IR/DataLayout.h"
69 #include "llvm/IR/DebugLoc.h"
70 #include "llvm/IR/DerivedTypes.h"
71 #include "llvm/IR/Function.h"
72 #include "llvm/IR/GlobalAlias.h"
73 #include "llvm/IR/GlobalValue.h"
74 #include "llvm/IR/GlobalVariable.h"
75 #include "llvm/IR/IRBuilder.h"
76 #include "llvm/IR/InlineAsm.h"
77 #include "llvm/IR/Instruction.h"
78 #include "llvm/IR/Instructions.h"
79 #include "llvm/IR/IntrinsicInst.h"
80 #include "llvm/IR/Intrinsics.h"
81 #include "llvm/IR/IntrinsicsARM.h"
82 #include "llvm/IR/Module.h"
83 #include "llvm/IR/PatternMatch.h"
84 #include "llvm/IR/Type.h"
85 #include "llvm/IR/User.h"
86 #include "llvm/IR/Value.h"
87 #include "llvm/MC/MCInstrDesc.h"
88 #include "llvm/MC/MCInstrItineraries.h"
89 #include "llvm/MC/MCRegisterInfo.h"
90 #include "llvm/MC/MCSchedule.h"
91 #include "llvm/Support/AtomicOrdering.h"
92 #include "llvm/Support/BranchProbability.h"
93 #include "llvm/Support/Casting.h"
94 #include "llvm/Support/CodeGen.h"
95 #include "llvm/Support/CommandLine.h"
96 #include "llvm/Support/Compiler.h"
97 #include "llvm/Support/Debug.h"
98 #include "llvm/Support/ErrorHandling.h"
99 #include "llvm/Support/KnownBits.h"
100 #include "llvm/Support/MachineValueType.h"
101 #include "llvm/Support/MathExtras.h"
102 #include "llvm/Support/raw_ostream.h"
103 #include "llvm/Target/TargetMachine.h"
104 #include "llvm/Target/TargetOptions.h"
105 #include <algorithm>
106 #include <cassert>
107 #include <cstdint>
108 #include <cstdlib>
109 #include <iterator>
110 #include <limits>
111 #include <string>
112 #include <tuple>
113 #include <utility>
114 #include <vector>
115 
116 using namespace llvm;
117 using namespace llvm::PatternMatch;
118 
119 #define DEBUG_TYPE "arm-isel"
120 
121 STATISTIC(NumTailCalls, "Number of tail calls");
122 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
123 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
124 STATISTIC(NumConstpoolPromoted,
125   "Number of constants with their storage promoted into constant pools");
126 
127 static cl::opt<bool>
128 ARMInterworking("arm-interworking", cl::Hidden,
129   cl::desc("Enable / disable ARM interworking (for debugging only)"),
130   cl::init(true));
131 
132 static cl::opt<bool> EnableConstpoolPromotion(
133     "arm-promote-constant", cl::Hidden,
134     cl::desc("Enable / disable promotion of unnamed_addr constants into "
135              "constant pools"),
136     cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
137 static cl::opt<unsigned> ConstpoolPromotionMaxSize(
138     "arm-promote-constant-max-size", cl::Hidden,
139     cl::desc("Maximum size of constant to promote into a constant pool"),
140     cl::init(64));
141 static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
142     "arm-promote-constant-max-total", cl::Hidden,
143     cl::desc("Maximum size of ALL constants to promote into a constant pool"),
144     cl::init(128));
145 
146 cl::opt<unsigned>
147 MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
148   cl::desc("Maximum interleave factor for MVE VLDn to generate."),
149   cl::init(2));
150 
151 // The APCS parameter registers.
152 static const MCPhysReg GPRArgRegs[] = {
153   ARM::R0, ARM::R1, ARM::R2, ARM::R3
154 };
155 
156 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
157                                        MVT PromotedBitwiseVT) {
158   if (VT != PromotedLdStVT) {
159     setOperationAction(ISD::LOAD, VT, Promote);
160     AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
161 
162     setOperationAction(ISD::STORE, VT, Promote);
163     AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
164   }
165 
166   MVT ElemTy = VT.getVectorElementType();
167   if (ElemTy != MVT::f64)
168     setOperationAction(ISD::SETCC, VT, Custom);
169   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
170   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
171   if (ElemTy == MVT::i32) {
172     setOperationAction(ISD::SINT_TO_FP, VT, Custom);
173     setOperationAction(ISD::UINT_TO_FP, VT, Custom);
174     setOperationAction(ISD::FP_TO_SINT, VT, Custom);
175     setOperationAction(ISD::FP_TO_UINT, VT, Custom);
176   } else {
177     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
178     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
179     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
180     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
181   }
182   setOperationAction(ISD::BUILD_VECTOR,      VT, Custom);
183   setOperationAction(ISD::VECTOR_SHUFFLE,    VT, Custom);
184   setOperationAction(ISD::CONCAT_VECTORS,    VT, Legal);
185   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
186   setOperationAction(ISD::SELECT,            VT, Expand);
187   setOperationAction(ISD::SELECT_CC,         VT, Expand);
188   setOperationAction(ISD::VSELECT,           VT, Expand);
189   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
190   if (VT.isInteger()) {
191     setOperationAction(ISD::SHL, VT, Custom);
192     setOperationAction(ISD::SRA, VT, Custom);
193     setOperationAction(ISD::SRL, VT, Custom);
194   }
195 
196   // Promote all bit-wise operations.
197   if (VT.isInteger() && VT != PromotedBitwiseVT) {
198     setOperationAction(ISD::AND, VT, Promote);
199     AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
200     setOperationAction(ISD::OR,  VT, Promote);
201     AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
202     setOperationAction(ISD::XOR, VT, Promote);
203     AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
204   }
205 
206   // Neon does not support vector divide/remainder operations.
207   setOperationAction(ISD::SDIV, VT, Expand);
208   setOperationAction(ISD::UDIV, VT, Expand);
209   setOperationAction(ISD::FDIV, VT, Expand);
210   setOperationAction(ISD::SREM, VT, Expand);
211   setOperationAction(ISD::UREM, VT, Expand);
212   setOperationAction(ISD::FREM, VT, Expand);
213   setOperationAction(ISD::SDIVREM, VT, Expand);
214   setOperationAction(ISD::UDIVREM, VT, Expand);
215 
216   if (!VT.isFloatingPoint() &&
217       VT != MVT::v2i64 && VT != MVT::v1i64)
218     for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
219       setOperationAction(Opcode, VT, Legal);
220   if (!VT.isFloatingPoint())
221     for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
222       setOperationAction(Opcode, VT, Legal);
223 }
224 
225 void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
226   addRegisterClass(VT, &ARM::DPRRegClass);
227   addTypeForNEON(VT, MVT::f64, MVT::v2i32);
228 }
229 
230 void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
231   addRegisterClass(VT, &ARM::DPairRegClass);
232   addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
233 }
234 
235 void ARMTargetLowering::setAllExpand(MVT VT) {
236   for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
237     setOperationAction(Opc, VT, Expand);
238 
239   // We support these really simple operations even on types where all
240   // the actual arithmetic has to be broken down into simpler
241   // operations or turned into library calls.
242   setOperationAction(ISD::BITCAST, VT, Legal);
243   setOperationAction(ISD::LOAD, VT, Legal);
244   setOperationAction(ISD::STORE, VT, Legal);
245   setOperationAction(ISD::UNDEF, VT, Legal);
246 }
247 
248 void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
249                                        LegalizeAction Action) {
250   setLoadExtAction(ISD::EXTLOAD,  From, To, Action);
251   setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
252   setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
253 }
254 
255 void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
256   const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
257 
258   for (auto VT : IntTypes) {
259     addRegisterClass(VT, &ARM::MQPRRegClass);
260     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
261     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
262     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
263     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
264     setOperationAction(ISD::SHL, VT, Custom);
265     setOperationAction(ISD::SRA, VT, Custom);
266     setOperationAction(ISD::SRL, VT, Custom);
267     setOperationAction(ISD::SMIN, VT, Legal);
268     setOperationAction(ISD::SMAX, VT, Legal);
269     setOperationAction(ISD::UMIN, VT, Legal);
270     setOperationAction(ISD::UMAX, VT, Legal);
271     setOperationAction(ISD::ABS, VT, Legal);
272     setOperationAction(ISD::SETCC, VT, Custom);
273     setOperationAction(ISD::MLOAD, VT, Custom);
274     setOperationAction(ISD::MSTORE, VT, Legal);
275     setOperationAction(ISD::CTLZ, VT, Legal);
276     setOperationAction(ISD::CTTZ, VT, Custom);
277     setOperationAction(ISD::BITREVERSE, VT, Legal);
278     setOperationAction(ISD::BSWAP, VT, Legal);
279     setOperationAction(ISD::SADDSAT, VT, Legal);
280     setOperationAction(ISD::UADDSAT, VT, Legal);
281     setOperationAction(ISD::SSUBSAT, VT, Legal);
282     setOperationAction(ISD::USUBSAT, VT, Legal);
283 
284     // No native support for these.
285     setOperationAction(ISD::UDIV, VT, Expand);
286     setOperationAction(ISD::SDIV, VT, Expand);
287     setOperationAction(ISD::UREM, VT, Expand);
288     setOperationAction(ISD::SREM, VT, Expand);
289     setOperationAction(ISD::UDIVREM, VT, Expand);
290     setOperationAction(ISD::SDIVREM, VT, Expand);
291     setOperationAction(ISD::CTPOP, VT, Expand);
292     setOperationAction(ISD::SELECT, VT, Expand);
293     setOperationAction(ISD::SELECT_CC, VT, Expand);
294 
295     // Vector reductions
296     setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
297     setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
298     setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
299     setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
300     setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
301     setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
302     setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
303     setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
304     setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
305 
306     if (!HasMVEFP) {
307       setOperationAction(ISD::SINT_TO_FP, VT, Expand);
308       setOperationAction(ISD::UINT_TO_FP, VT, Expand);
309       setOperationAction(ISD::FP_TO_SINT, VT, Expand);
310       setOperationAction(ISD::FP_TO_UINT, VT, Expand);
311     }
312 
313     // Pre and Post inc are supported on loads and stores
314     for (unsigned im = (unsigned)ISD::PRE_INC;
315          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
316       setIndexedLoadAction(im, VT, Legal);
317       setIndexedStoreAction(im, VT, Legal);
318       setIndexedMaskedLoadAction(im, VT, Legal);
319       setIndexedMaskedStoreAction(im, VT, Legal);
320     }
321   }
322 
323   const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
324   for (auto VT : FloatTypes) {
325     addRegisterClass(VT, &ARM::MQPRRegClass);
326     if (!HasMVEFP)
327       setAllExpand(VT);
328 
329     // These are legal or custom whether we have MVE.fp or not
330     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
331     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
332     setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
333     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
334     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
335     setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
336     setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
337     setOperationAction(ISD::SETCC, VT, Custom);
338     setOperationAction(ISD::MLOAD, VT, Custom);
339     setOperationAction(ISD::MSTORE, VT, Legal);
340     setOperationAction(ISD::SELECT, VT, Expand);
341     setOperationAction(ISD::SELECT_CC, VT, Expand);
342 
343     // Pre and Post inc are supported on loads and stores
344     for (unsigned im = (unsigned)ISD::PRE_INC;
345          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
346       setIndexedLoadAction(im, VT, Legal);
347       setIndexedStoreAction(im, VT, Legal);
348       setIndexedMaskedLoadAction(im, VT, Legal);
349       setIndexedMaskedStoreAction(im, VT, Legal);
350     }
351 
352     if (HasMVEFP) {
353       setOperationAction(ISD::FMINNUM, VT, Legal);
354       setOperationAction(ISD::FMAXNUM, VT, Legal);
355       setOperationAction(ISD::FROUND, VT, Legal);
356       setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
357       setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
358       setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
359       setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
360 
361       // No native support for these.
362       setOperationAction(ISD::FDIV, VT, Expand);
363       setOperationAction(ISD::FREM, VT, Expand);
364       setOperationAction(ISD::FSQRT, VT, Expand);
365       setOperationAction(ISD::FSIN, VT, Expand);
366       setOperationAction(ISD::FCOS, VT, Expand);
367       setOperationAction(ISD::FPOW, VT, Expand);
368       setOperationAction(ISD::FLOG, VT, Expand);
369       setOperationAction(ISD::FLOG2, VT, Expand);
370       setOperationAction(ISD::FLOG10, VT, Expand);
371       setOperationAction(ISD::FEXP, VT, Expand);
372       setOperationAction(ISD::FEXP2, VT, Expand);
373       setOperationAction(ISD::FNEARBYINT, VT, Expand);
374     }
375   }
376 
377   // Custom Expand smaller than legal vector reductions to prevent false zero
378   // items being added.
379   setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
380   setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
381   setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
382   setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
383   setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
384   setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
385   setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
386   setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);
387 
388   // We 'support' these types up to bitcast/load/store level, regardless of
389   // MVE integer-only / float support. Only doing FP data processing on the FP
390   // vector types is inhibited at integer-only level.
391   const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
392   for (auto VT : LongTypes) {
393     addRegisterClass(VT, &ARM::MQPRRegClass);
394     setAllExpand(VT);
395     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
396     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
397     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
398   }
399   // We can do bitwise operations on v2i64 vectors
400   setOperationAction(ISD::AND, MVT::v2i64, Legal);
401   setOperationAction(ISD::OR, MVT::v2i64, Legal);
402   setOperationAction(ISD::XOR, MVT::v2i64, Legal);
403 
404   // It is legal to extload from v4i8 to v4i16 or v4i32.
405   addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
406   addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
407   addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
408 
409   // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
410   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8,  Legal);
411   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
412   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
413   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8,  Legal);
414   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal);
415 
416   // Some truncating stores are legal too.
417   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
418   setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
419   setTruncStoreAction(MVT::v8i16, MVT::v8i8,  Legal);
420 
421   // Pre and Post inc on these are legal, given the correct extends
422   for (unsigned im = (unsigned)ISD::PRE_INC;
423        im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
424     for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
425       setIndexedLoadAction(im, VT, Legal);
426       setIndexedStoreAction(im, VT, Legal);
427       setIndexedMaskedLoadAction(im, VT, Legal);
428       setIndexedMaskedStoreAction(im, VT, Legal);
429     }
430   }
431 
432   // Predicate types
433   const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1};
434   for (auto VT : pTypes) {
435     addRegisterClass(VT, &ARM::VCCRRegClass);
436     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
437     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
438     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
439     setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
440     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
441     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
442     setOperationAction(ISD::SETCC, VT, Custom);
443     setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
444     setOperationAction(ISD::LOAD, VT, Custom);
445     setOperationAction(ISD::STORE, VT, Custom);
446     setOperationAction(ISD::TRUNCATE, VT, Custom);
447     setOperationAction(ISD::VSELECT, VT, Expand);
448     setOperationAction(ISD::SELECT, VT, Expand);
449   }
450 }
451 
452 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
453                                      const ARMSubtarget &STI)
454     : TargetLowering(TM), Subtarget(&STI) {
455   RegInfo = Subtarget->getRegisterInfo();
456   Itins = Subtarget->getInstrItineraryData();
457 
458   setBooleanContents(ZeroOrOneBooleanContent);
459   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
460 
461   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
462       !Subtarget->isTargetWatchOS()) {
463     bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
464     for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
465       setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
466                             IsHFTarget ? CallingConv::ARM_AAPCS_VFP
467                                        : CallingConv::ARM_AAPCS);
468   }
469 
470   if (Subtarget->isTargetMachO()) {
471     // Uses VFP for Thumb libfuncs if available.
472     if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
473         Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
474       static const struct {
475         const RTLIB::Libcall Op;
476         const char * const Name;
477         const ISD::CondCode Cond;
478       } LibraryCalls[] = {
479         // Single-precision floating-point arithmetic.
480         { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
481         { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
482         { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
483         { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
484 
485         // Double-precision floating-point arithmetic.
486         { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
487         { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
488         { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
489         { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
490 
491         // Single-precision comparisons.
492         { RTLIB::OEQ_F32, "__eqsf2vfp",    ISD::SETNE },
493         { RTLIB::UNE_F32, "__nesf2vfp",    ISD::SETNE },
494         { RTLIB::OLT_F32, "__ltsf2vfp",    ISD::SETNE },
495         { RTLIB::OLE_F32, "__lesf2vfp",    ISD::SETNE },
496         { RTLIB::OGE_F32, "__gesf2vfp",    ISD::SETNE },
497         { RTLIB::OGT_F32, "__gtsf2vfp",    ISD::SETNE },
498         { RTLIB::UO_F32,  "__unordsf2vfp", ISD::SETNE },
499 
500         // Double-precision comparisons.
501         { RTLIB::OEQ_F64, "__eqdf2vfp",    ISD::SETNE },
502         { RTLIB::UNE_F64, "__nedf2vfp",    ISD::SETNE },
503         { RTLIB::OLT_F64, "__ltdf2vfp",    ISD::SETNE },
504         { RTLIB::OLE_F64, "__ledf2vfp",    ISD::SETNE },
505         { RTLIB::OGE_F64, "__gedf2vfp",    ISD::SETNE },
506         { RTLIB::OGT_F64, "__gtdf2vfp",    ISD::SETNE },
507         { RTLIB::UO_F64,  "__unorddf2vfp", ISD::SETNE },
508 
509         // Floating-point to integer conversions.
510         // i64 conversions are done via library routines even when generating VFP
511         // instructions, so use the same ones.
512         { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    ISD::SETCC_INVALID },
513         { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
514         { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    ISD::SETCC_INVALID },
515         { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
516 
517         // Conversions between floating types.
518         { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  ISD::SETCC_INVALID },
519         { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", ISD::SETCC_INVALID },
520 
521         // Integer to floating-point conversions.
522         // i64 conversions are done via library routines even when generating VFP
523         // instructions, so use the same ones.
524         // FIXME: There appears to be some naming inconsistency in ARM libgcc:
525         // e.g., __floatunsidf vs. __floatunssidfvfp.
526         { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    ISD::SETCC_INVALID },
527         { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
528         { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    ISD::SETCC_INVALID },
529         { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
530       };
531 
532       for (const auto &LC : LibraryCalls) {
533         setLibcallName(LC.Op, LC.Name);
534         if (LC.Cond != ISD::SETCC_INVALID)
535           setCmpLibcallCC(LC.Op, LC.Cond);
536       }
537     }
538   }
539 
540   // These libcalls are not available in 32-bit.
541   setLibcallName(RTLIB::SHL_I128, nullptr);
542   setLibcallName(RTLIB::SRL_I128, nullptr);
543   setLibcallName(RTLIB::SRA_I128, nullptr);
544   setLibcallName(RTLIB::MUL_I128, nullptr);
545 
546   // RTLIB
547   if (Subtarget->isAAPCS_ABI() &&
548       (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
549        Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
550     static const struct {
551       const RTLIB::Libcall Op;
552       const char * const Name;
553       const CallingConv::ID CC;
554       const ISD::CondCode Cond;
555     } LibraryCalls[] = {
556       // Double-precision floating-point arithmetic helper functions
557       // RTABI chapter 4.1.2, Table 2
558       { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
559       { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
560       { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
561       { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
562 
563       // Double-precision floating-point comparison helper functions
564       // RTABI chapter 4.1.2, Table 3
565       { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
566       { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
567       { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
568       { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
569       { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
570       { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
571       { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
572 
573       // Single-precision floating-point arithmetic helper functions
574       // RTABI chapter 4.1.2, Table 4
575       { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
576       { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
577       { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
578       { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
579 
580       // Single-precision floating-point comparison helper functions
581       // RTABI chapter 4.1.2, Table 5
582       { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
583       { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
584       { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
585       { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
586       { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
587       { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
588       { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
589 
590       // Floating-point to integer conversions.
591       // RTABI chapter 4.1.2, Table 6
592       { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
593       { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
594       { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
595       { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
596       { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
597       { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
598       { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
599       { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
600 
601       // Conversions between floating types.
602       // RTABI chapter 4.1.2, Table 7
603       { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
604       { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
605       { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
606 
607       // Integer to floating-point conversions.
608       // RTABI chapter 4.1.2, Table 8
609       { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
610       { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
611       { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
612       { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
613       { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
614       { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
615       { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
616       { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
617 
618       // Long long helper functions
619       // RTABI chapter 4.2, Table 9
620       { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
621       { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
622       { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
623       { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
624 
625       // Integer division functions
626       // RTABI chapter 4.3.1
627       { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
628       { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
629       { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
630       { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
631       { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
632       { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
633       { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
634       { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
635     };
636 
637     for (const auto &LC : LibraryCalls) {
638       setLibcallName(LC.Op, LC.Name);
639       setLibcallCallingConv(LC.Op, LC.CC);
640       if (LC.Cond != ISD::SETCC_INVALID)
641         setCmpLibcallCC(LC.Op, LC.Cond);
642     }
643 
644     // EABI dependent RTLIB
645     if (TM.Options.EABIVersion == EABI::EABI4 ||
646         TM.Options.EABIVersion == EABI::EABI5) {
647       static const struct {
648         const RTLIB::Libcall Op;
649         const char *const Name;
650         const CallingConv::ID CC;
651         const ISD::CondCode Cond;
652       } MemOpsLibraryCalls[] = {
653         // Memory operations
654         // RTABI chapter 4.3.4
655         { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
656         { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
657         { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
658       };
659 
660       for (const auto &LC : MemOpsLibraryCalls) {
661         setLibcallName(LC.Op, LC.Name);
662         setLibcallCallingConv(LC.Op, LC.CC);
663         if (LC.Cond != ISD::SETCC_INVALID)
664           setCmpLibcallCC(LC.Op, LC.Cond);
665       }
666     }
667   }
668 
669   if (Subtarget->isTargetWindows()) {
670     static const struct {
671       const RTLIB::Libcall Op;
672       const char * const Name;
673       const CallingConv::ID CC;
674     } LibraryCalls[] = {
675       { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
676       { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
677       { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
678       { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
679       { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
680       { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
681       { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
682       { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
683     };
684 
685     for (const auto &LC : LibraryCalls) {
686       setLibcallName(LC.Op, LC.Name);
687       setLibcallCallingConv(LC.Op, LC.CC);
688     }
689   }
690 
691   // Use divmod compiler-rt calls for iOS 5.0 and later.
692   if (Subtarget->isTargetMachO() &&
693       !(Subtarget->isTargetIOS() &&
694         Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
695     setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
696     setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
697   }
698 
699   // The half <-> float conversion functions are always soft-float on
700   // non-watchos platforms, but are needed for some targets which use a
701   // hard-float calling convention by default.
702   if (!Subtarget->isTargetWatchABI()) {
703     if (Subtarget->isAAPCS_ABI()) {
704       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
705       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
706       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
707     } else {
708       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
709       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
710       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
711     }
712   }
713 
714   // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
715   // a __gnu_ prefix (which is the default).
716   if (Subtarget->isTargetAEABI()) {
717     static const struct {
718       const RTLIB::Libcall Op;
719       const char * const Name;
720       const CallingConv::ID CC;
721     } LibraryCalls[] = {
722       { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
723       { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
724       { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
725     };
726 
727     for (const auto &LC : LibraryCalls) {
728       setLibcallName(LC.Op, LC.Name);
729       setLibcallCallingConv(LC.Op, LC.CC);
730     }
731   }
732 
733   if (Subtarget->isThumb1Only())
734     addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
735   else
736     addRegisterClass(MVT::i32, &ARM::GPRRegClass);
737 
738   if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
739       Subtarget->hasFPRegs()) {
740     addRegisterClass(MVT::f32, &ARM::SPRRegClass);
741     addRegisterClass(MVT::f64, &ARM::DPRRegClass);
742     if (!Subtarget->hasVFP2Base())
743       setAllExpand(MVT::f32);
744     if (!Subtarget->hasFP64())
745       setAllExpand(MVT::f64);
746   }
747 
748   if (Subtarget->hasFullFP16()) {
749     addRegisterClass(MVT::f16, &ARM::HPRRegClass);
750     setOperationAction(ISD::BITCAST, MVT::i16, Custom);
751     setOperationAction(ISD::BITCAST, MVT::f16, Custom);
752 
753     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
754     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
755   }
756 
757   if (Subtarget->hasBF16()) {
758     addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
759     setAllExpand(MVT::bf16);
760     if (!Subtarget->hasFullFP16())
761       setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
762   }
763 
764   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
765     for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
766       setTruncStoreAction(VT, InnerVT, Expand);
767       addAllExtLoads(VT, InnerVT, Expand);
768     }
769 
770     setOperationAction(ISD::MULHS, VT, Expand);
771     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
772     setOperationAction(ISD::MULHU, VT, Expand);
773     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
774 
775     setOperationAction(ISD::BSWAP, VT, Expand);
776   }
777 
778   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
779   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
780 
781   setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
782   setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
783 
784   if (Subtarget->hasMVEIntegerOps())
785     addMVEVectorTypes(Subtarget->hasMVEFloatOps());
786 
787   // Combine low-overhead loop intrinsics so that we can lower i1 types.
788   if (Subtarget->hasLOB()) {
789     setTargetDAGCombine(ISD::BRCOND);
790     setTargetDAGCombine(ISD::BR_CC);
791   }
792 
793   if (Subtarget->hasNEON()) {
794     addDRTypeForNEON(MVT::v2f32);
795     addDRTypeForNEON(MVT::v8i8);
796     addDRTypeForNEON(MVT::v4i16);
797     addDRTypeForNEON(MVT::v2i32);
798     addDRTypeForNEON(MVT::v1i64);
799 
800     addQRTypeForNEON(MVT::v4f32);
801     addQRTypeForNEON(MVT::v2f64);
802     addQRTypeForNEON(MVT::v16i8);
803     addQRTypeForNEON(MVT::v8i16);
804     addQRTypeForNEON(MVT::v4i32);
805     addQRTypeForNEON(MVT::v2i64);
806 
807     if (Subtarget->hasFullFP16()) {
808       addQRTypeForNEON(MVT::v8f16);
809       addDRTypeForNEON(MVT::v4f16);
810     }
811 
812     if (Subtarget->hasBF16()) {
813       addQRTypeForNEON(MVT::v8bf16);
814       addDRTypeForNEON(MVT::v4bf16);
815     }
816   }
817 
818   if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
819     // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
820     // none of Neon, MVE or VFP supports any arithmetic operations on it.
821     setOperationAction(ISD::FADD, MVT::v2f64, Expand);
822     setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
823     setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
824     // FIXME: Code duplication: FDIV and FREM are expanded always, see
825     // ARMTargetLowering::addTypeForNEON method for details.
826     setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
827     setOperationAction(ISD::FREM, MVT::v2f64, Expand);
828     // FIXME: Create unittest.
829     // In another words, find a way when "copysign" appears in DAG with vector
830     // operands.
831     setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
832     // FIXME: Code duplication: SETCC has custom operation action, see
833     // ARMTargetLowering::addTypeForNEON method for details.
834     setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
835     // FIXME: Create unittest for FNEG and for FABS.
836     setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
837     setOperationAction(ISD::FABS, MVT::v2f64, Expand);
838     setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
839     setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
840     setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
841     setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
842     setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
843     setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
844     setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
845     setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
846     setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
847     // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
848     setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
849     setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
850     setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
851     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
852     setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
853     setOperationAction(ISD::FMA, MVT::v2f64, Expand);
854   }
855 
856   if (Subtarget->hasNEON()) {
857     // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
858     // supported for v4f32.
859     setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
860     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
861     setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
862     setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
863     setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
864     setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
865     setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
866     setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
867     setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
868     setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
869     setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
870     setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
871     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
872     setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
873 
874     // Mark v2f32 intrinsics.
875     setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
876     setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
877     setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
878     setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
879     setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
880     setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
881     setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
882     setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
883     setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
884     setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
885     setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
886     setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
887     setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
888     setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
889 
890     // Neon does not support some operations on v1i64 and v2i64 types.
891     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
892     // Custom handling for some quad-vector types to detect VMULL.
893     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
894     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
895     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
896     // Custom handling for some vector types to avoid expensive expansions
897     setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
898     setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
899     setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
900     setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
901     // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
902     // a destination type that is wider than the source, and nor does
903     // it have a FP_TO_[SU]INT instruction with a narrower destination than
904     // source.
905     setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
906     setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
907     setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
908     setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
909     setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
910     setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
911     setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
912     setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
913 
914     setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
915     setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
916 
917     // NEON does not have single instruction CTPOP for vectors with element
918     // types wider than 8-bits.  However, custom lowering can leverage the
919     // v8i8/v16i8 vcnt instruction.
920     setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
921     setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
922     setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
923     setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
924     setOperationAction(ISD::CTPOP,      MVT::v1i64, Custom);
925     setOperationAction(ISD::CTPOP,      MVT::v2i64, Custom);
926 
927     setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
928     setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
929 
930     // NEON does not have single instruction CTTZ for vectors.
931     setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
932     setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
933     setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
934     setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
935 
936     setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
937     setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
938     setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
939     setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
940 
941     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
942     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
943     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
944     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
945 
946     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
947     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
948     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
949     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
950 
951     // NEON only has FMA instructions as of VFP4.
952     if (!Subtarget->hasVFP4Base()) {
953       setOperationAction(ISD::FMA, MVT::v2f32, Expand);
954       setOperationAction(ISD::FMA, MVT::v4f32, Expand);
955     }
956 
957     setTargetDAGCombine(ISD::SHL);
958     setTargetDAGCombine(ISD::SRL);
959     setTargetDAGCombine(ISD::SRA);
960     setTargetDAGCombine(ISD::FP_TO_SINT);
961     setTargetDAGCombine(ISD::FP_TO_UINT);
962     setTargetDAGCombine(ISD::FDIV);
963     setTargetDAGCombine(ISD::LOAD);
964 
965     // It is legal to extload from v4i8 to v4i16 or v4i32.
966     for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
967                    MVT::v2i32}) {
968       for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
969         setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
970         setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
971         setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
972       }
973     }
974   }
975 
976   if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
977     setTargetDAGCombine(ISD::BUILD_VECTOR);
978     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
979     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
980     setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
981     setTargetDAGCombine(ISD::STORE);
982     setTargetDAGCombine(ISD::SIGN_EXTEND);
983     setTargetDAGCombine(ISD::ZERO_EXTEND);
984     setTargetDAGCombine(ISD::ANY_EXTEND);
985     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
986     setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
987     setTargetDAGCombine(ISD::INTRINSIC_VOID);
988     setTargetDAGCombine(ISD::VECREDUCE_ADD);
989     setTargetDAGCombine(ISD::ADD);
990     setTargetDAGCombine(ISD::BITCAST);
991   }
992   if (Subtarget->hasMVEIntegerOps()) {
993     setTargetDAGCombine(ISD::SMIN);
994     setTargetDAGCombine(ISD::UMIN);
995     setTargetDAGCombine(ISD::SMAX);
996     setTargetDAGCombine(ISD::UMAX);
997     setTargetDAGCombine(ISD::FP_EXTEND);
998     setTargetDAGCombine(ISD::SELECT);
999     setTargetDAGCombine(ISD::SELECT_CC);
1000   }
1001 
1002   if (!Subtarget->hasFP64()) {
1003     // When targeting a floating-point unit with only single-precision
1004     // operations, f64 is legal for the few double-precision instructions which
1005     // are present However, no double-precision operations other than moves,
1006     // loads and stores are provided by the hardware.
1007     setOperationAction(ISD::FADD,       MVT::f64, Expand);
1008     setOperationAction(ISD::FSUB,       MVT::f64, Expand);
1009     setOperationAction(ISD::FMUL,       MVT::f64, Expand);
1010     setOperationAction(ISD::FMA,        MVT::f64, Expand);
1011     setOperationAction(ISD::FDIV,       MVT::f64, Expand);
1012     setOperationAction(ISD::FREM,       MVT::f64, Expand);
1013     setOperationAction(ISD::FCOPYSIGN,  MVT::f64, Expand);
1014     setOperationAction(ISD::FGETSIGN,   MVT::f64, Expand);
1015     setOperationAction(ISD::FNEG,       MVT::f64, Expand);
1016     setOperationAction(ISD::FABS,       MVT::f64, Expand);
1017     setOperationAction(ISD::FSQRT,      MVT::f64, Expand);
1018     setOperationAction(ISD::FSIN,       MVT::f64, Expand);
1019     setOperationAction(ISD::FCOS,       MVT::f64, Expand);
1020     setOperationAction(ISD::FPOW,       MVT::f64, Expand);
1021     setOperationAction(ISD::FLOG,       MVT::f64, Expand);
1022     setOperationAction(ISD::FLOG2,      MVT::f64, Expand);
1023     setOperationAction(ISD::FLOG10,     MVT::f64, Expand);
1024     setOperationAction(ISD::FEXP,       MVT::f64, Expand);
1025     setOperationAction(ISD::FEXP2,      MVT::f64, Expand);
1026     setOperationAction(ISD::FCEIL,      MVT::f64, Expand);
1027     setOperationAction(ISD::FTRUNC,     MVT::f64, Expand);
1028     setOperationAction(ISD::FRINT,      MVT::f64, Expand);
1029     setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
1030     setOperationAction(ISD::FFLOOR,     MVT::f64, Expand);
1031     setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
1032     setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
1033     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
1034     setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
1035     setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
1036     setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
1037     setOperationAction(ISD::FP_ROUND,   MVT::f32, Custom);
1038     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
1039     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
1040     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom);
1041     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom);
1042     setOperationAction(ISD::STRICT_FP_ROUND,   MVT::f32, Custom);
1043   }
1044 
1045   if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
1046     setOperationAction(ISD::FP_EXTEND,  MVT::f64, Custom);
1047     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
1048     if (Subtarget->hasFullFP16()) {
1049       setOperationAction(ISD::FP_ROUND,  MVT::f16, Custom);
1050       setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
1051     }
1052   }
1053 
1054   if (!Subtarget->hasFP16()) {
1055     setOperationAction(ISD::FP_EXTEND,  MVT::f32, Custom);
1056     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
1057   }
1058 
1059   computeRegisterProperties(Subtarget->getRegisterInfo());
1060 
1061   // ARM does not have floating-point extending loads.
1062   for (MVT VT : MVT::fp_valuetypes()) {
1063     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1064     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1065   }
1066 
1067   // ... or truncating stores
1068   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
1069   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
1070   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
1071 
1072   // ARM does not have i1 sign extending load.
1073   for (MVT VT : MVT::integer_valuetypes())
1074     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
1075 
1076   // ARM supports all 4 flavors of integer indexed load / store.
1077   if (!Subtarget->isThumb1Only()) {
1078     for (unsigned im = (unsigned)ISD::PRE_INC;
1079          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1080       setIndexedLoadAction(im,  MVT::i1,  Legal);
1081       setIndexedLoadAction(im,  MVT::i8,  Legal);
1082       setIndexedLoadAction(im,  MVT::i16, Legal);
1083       setIndexedLoadAction(im,  MVT::i32, Legal);
1084       setIndexedStoreAction(im, MVT::i1,  Legal);
1085       setIndexedStoreAction(im, MVT::i8,  Legal);
1086       setIndexedStoreAction(im, MVT::i16, Legal);
1087       setIndexedStoreAction(im, MVT::i32, Legal);
1088     }
1089   } else {
1090     // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
1091     setIndexedLoadAction(ISD::POST_INC, MVT::i32,  Legal);
1092     setIndexedStoreAction(ISD::POST_INC, MVT::i32,  Legal);
1093   }
1094 
1095   setOperationAction(ISD::SADDO, MVT::i32, Custom);
1096   setOperationAction(ISD::UADDO, MVT::i32, Custom);
1097   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
1098   setOperationAction(ISD::USUBO, MVT::i32, Custom);
1099 
1100   setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
1101   setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
1102   if (Subtarget->hasDSP()) {
1103     setOperationAction(ISD::SADDSAT, MVT::i8, Custom);
1104     setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
1105     setOperationAction(ISD::SADDSAT, MVT::i16, Custom);
1106     setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
1107   }
1108   if (Subtarget->hasBaseDSP()) {
1109     setOperationAction(ISD::SADDSAT, MVT::i32, Legal);
1110     setOperationAction(ISD::SSUBSAT, MVT::i32, Legal);
1111   }
1112 
1113   // i64 operation support.
1114   setOperationAction(ISD::MUL,     MVT::i64, Expand);
1115   setOperationAction(ISD::MULHU,   MVT::i32, Expand);
1116   if (Subtarget->isThumb1Only()) {
1117     setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
1118     setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
1119   }
1120   if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
1121       || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
1122     setOperationAction(ISD::MULHS, MVT::i32, Expand);
1123 
1124   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
1125   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
1126   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
1127   setOperationAction(ISD::SRL,       MVT::i64, Custom);
1128   setOperationAction(ISD::SRA,       MVT::i64, Custom);
1129   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1130   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1131   setOperationAction(ISD::LOAD, MVT::i64, Custom);
1132   setOperationAction(ISD::STORE, MVT::i64, Custom);
1133 
1134   // MVE lowers 64 bit shifts to lsll and lsrl
1135   // assuming that ISD::SRL and SRA of i64 are already marked custom
1136   if (Subtarget->hasMVEIntegerOps())
1137     setOperationAction(ISD::SHL, MVT::i64, Custom);
1138 
1139   // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1140   if (Subtarget->isThumb1Only()) {
1141     setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
1142     setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
1143     setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
1144   }
1145 
1146   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1147     setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
1148 
1149   // ARM does not have ROTL.
1150   setOperationAction(ISD::ROTL, MVT::i32, Expand);
1151   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1152     setOperationAction(ISD::ROTL, VT, Expand);
1153     setOperationAction(ISD::ROTR, VT, Expand);
1154   }
1155   setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
1156   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
1157   if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1158     setOperationAction(ISD::CTLZ, MVT::i32, Expand);
1159     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
1160   }
1161 
1162   // @llvm.readcyclecounter requires the Performance Monitors extension.
1163   // Default to the 0 expansion on unsupported platforms.
1164   // FIXME: Technically there are older ARM CPUs that have
1165   // implementation-specific ways of obtaining this information.
1166   if (Subtarget->hasPerfMon())
1167     setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
1168 
1169   // Only ARMv6 has BSWAP.
1170   if (!Subtarget->hasV6Ops())
1171     setOperationAction(ISD::BSWAP, MVT::i32, Expand);
1172 
1173   bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1174                                         : Subtarget->hasDivideInARMMode();
1175   if (!hasDivide) {
1176     // These are expanded into libcalls if the cpu doesn't have HW divider.
1177     setOperationAction(ISD::SDIV,  MVT::i32, LibCall);
1178     setOperationAction(ISD::UDIV,  MVT::i32, LibCall);
1179   }
1180 
1181   if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
1182     setOperationAction(ISD::SDIV, MVT::i32, Custom);
1183     setOperationAction(ISD::UDIV, MVT::i32, Custom);
1184 
1185     setOperationAction(ISD::SDIV, MVT::i64, Custom);
1186     setOperationAction(ISD::UDIV, MVT::i64, Custom);
1187   }
1188 
1189   setOperationAction(ISD::SREM,  MVT::i32, Expand);
1190   setOperationAction(ISD::UREM,  MVT::i32, Expand);
1191 
1192   // Register based DivRem for AEABI (RTABI 4.2)
1193   if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
1194       Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
1195       Subtarget->isTargetWindows()) {
1196     setOperationAction(ISD::SREM, MVT::i64, Custom);
1197     setOperationAction(ISD::UREM, MVT::i64, Custom);
1198     HasStandaloneRem = false;
1199 
1200     if (Subtarget->isTargetWindows()) {
1201       const struct {
1202         const RTLIB::Libcall Op;
1203         const char * const Name;
1204         const CallingConv::ID CC;
1205       } LibraryCalls[] = {
1206         { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
1207         { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
1208         { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
1209         { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
1210 
1211         { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
1212         { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
1213         { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
1214         { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
1215       };
1216 
1217       for (const auto &LC : LibraryCalls) {
1218         setLibcallName(LC.Op, LC.Name);
1219         setLibcallCallingConv(LC.Op, LC.CC);
1220       }
1221     } else {
1222       const struct {
1223         const RTLIB::Libcall Op;
1224         const char * const Name;
1225         const CallingConv::ID CC;
1226       } LibraryCalls[] = {
1227         { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1228         { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1229         { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1230         { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
1231 
1232         { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1233         { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1234         { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1235         { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
1236       };
1237 
1238       for (const auto &LC : LibraryCalls) {
1239         setLibcallName(LC.Op, LC.Name);
1240         setLibcallCallingConv(LC.Op, LC.CC);
1241       }
1242     }
1243 
1244     setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
1245     setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
1246     setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
1247     setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
1248   } else {
1249     setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
1250     setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
1251   }
1252 
1253   if (Subtarget->getTargetTriple().isOSMSVCRT()) {
1254     // MSVCRT doesn't have powi; fall back to pow
1255     setLibcallName(RTLIB::POWI_F32, nullptr);
1256     setLibcallName(RTLIB::POWI_F64, nullptr);
1257   }
1258 
1259   setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
1260   setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
1261   setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
1262   setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
1263 
1264   setOperationAction(ISD::TRAP, MVT::Other, Legal);
1265   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1266 
1267   // Use the default implementation.
1268   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
1269   setOperationAction(ISD::VAARG,              MVT::Other, Expand);
1270   setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
1271   setOperationAction(ISD::VAEND,              MVT::Other, Expand);
1272   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
1273   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
1274 
1275   if (Subtarget->isTargetWindows())
1276     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
1277   else
1278     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
1279 
1280   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1281   // the default expansion.
1282   InsertFencesForAtomic = false;
1283   if (Subtarget->hasAnyDataBarrier() &&
1284       (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1285     // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1286     // to ldrex/strex loops already.
1287     setOperationAction(ISD::ATOMIC_FENCE,     MVT::Other, Custom);
1288     if (!Subtarget->isThumb() || !Subtarget->isMClass())
1289       setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
1290 
1291     // On v8, we have particularly efficient implementations of atomic fences
1292     // if they can be combined with nearby atomic loads and stores.
1293     if (!Subtarget->hasAcquireRelease() ||
1294         getTargetMachine().getOptLevel() == 0) {
1295       // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1296       InsertFencesForAtomic = true;
1297     }
1298   } else {
1299     // If there's anything we can use as a barrier, go through custom lowering
1300     // for ATOMIC_FENCE.
1301     // If target has DMB in thumb, Fences can be inserted.
1302     if (Subtarget->hasDataBarrier())
1303       InsertFencesForAtomic = true;
1304 
1305     setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other,
1306                        Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1307 
1308     // Set them all for expansion, which will force libcalls.
1309     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
1310     setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
1311     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
1312     setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
1313     setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
1314     setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
1315     setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
1316     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
1317     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
1318     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
1319     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
1320     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
1321     // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1322     // Unordered/Monotonic case.
1323     if (!InsertFencesForAtomic) {
1324       setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
1325       setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
1326     }
1327   }
1328 
1329   setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
1330 
1331   // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1332   if (!Subtarget->hasV6Ops()) {
1333     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
1334     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
1335   }
1336   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
1337 
1338   if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1339       !Subtarget->isThumb1Only()) {
1340     // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1341     // iff target supports vfp2.
1342     setOperationAction(ISD::BITCAST, MVT::i64, Custom);
1343     setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
1344   }
1345 
1346   // We want to custom lower some of our intrinsics.
1347   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1348   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
1349   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
1350   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
1351   if (Subtarget->useSjLjEH())
1352     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
1353 
1354   setOperationAction(ISD::SETCC,     MVT::i32, Expand);
1355   setOperationAction(ISD::SETCC,     MVT::f32, Expand);
1356   setOperationAction(ISD::SETCC,     MVT::f64, Expand);
1357   setOperationAction(ISD::SELECT,    MVT::i32, Custom);
1358   setOperationAction(ISD::SELECT,    MVT::f32, Custom);
1359   setOperationAction(ISD::SELECT,    MVT::f64, Custom);
1360   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
1361   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
1362   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
1363   if (Subtarget->hasFullFP16()) {
1364     setOperationAction(ISD::SETCC,     MVT::f16, Expand);
1365     setOperationAction(ISD::SELECT,    MVT::f16, Custom);
1366     setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
1367   }
1368 
1369   setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);
1370 
1371   setOperationAction(ISD::BRCOND,    MVT::Other, Custom);
1372   setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
1373   if (Subtarget->hasFullFP16())
1374       setOperationAction(ISD::BR_CC, MVT::f16,   Custom);
1375   setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
1376   setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
1377   setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
1378 
1379   // We don't support sin/cos/fmod/copysign/pow
1380   setOperationAction(ISD::FSIN,      MVT::f64, Expand);
1381   setOperationAction(ISD::FSIN,      MVT::f32, Expand);
1382   setOperationAction(ISD::FCOS,      MVT::f32, Expand);
1383   setOperationAction(ISD::FCOS,      MVT::f64, Expand);
1384   setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
1385   setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
1386   setOperationAction(ISD::FREM,      MVT::f64, Expand);
1387   setOperationAction(ISD::FREM,      MVT::f32, Expand);
1388   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1389       !Subtarget->isThumb1Only()) {
1390     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
1391     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
1392   }
1393   setOperationAction(ISD::FPOW,      MVT::f64, Expand);
1394   setOperationAction(ISD::FPOW,      MVT::f32, Expand);
1395 
1396   if (!Subtarget->hasVFP4Base()) {
1397     setOperationAction(ISD::FMA, MVT::f64, Expand);
1398     setOperationAction(ISD::FMA, MVT::f32, Expand);
1399   }
1400 
1401   // Various VFP goodness
1402   if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1403     // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1404     if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1405       setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
1406       setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
1407     }
1408 
1409     // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1410     if (!Subtarget->hasFP16()) {
1411       setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
1412       setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
1413     }
1414 
1415     // Strict floating-point comparisons need custom lowering.
1416     setOperationAction(ISD::STRICT_FSETCC,  MVT::f16, Custom);
1417     setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
1418     setOperationAction(ISD::STRICT_FSETCC,  MVT::f32, Custom);
1419     setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
1420     setOperationAction(ISD::STRICT_FSETCC,  MVT::f64, Custom);
1421     setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
1422   }
1423 
1424   // Use __sincos_stret if available.
1425   if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1426       getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1427     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1428     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1429   }
1430 
1431   // FP-ARMv8 implements a lot of rounding-like FP operations.
1432   if (Subtarget->hasFPARMv8Base()) {
1433     setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1434     setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1435     setOperationAction(ISD::FROUND, MVT::f32, Legal);
1436     setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1437     setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1438     setOperationAction(ISD::FRINT, MVT::f32, Legal);
1439     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
1440     setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
1441     if (Subtarget->hasNEON()) {
1442       setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
1443       setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
1444       setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
1445       setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
1446     }
1447 
1448     if (Subtarget->hasFP64()) {
1449       setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1450       setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1451       setOperationAction(ISD::FROUND, MVT::f64, Legal);
1452       setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1453       setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1454       setOperationAction(ISD::FRINT, MVT::f64, Legal);
1455       setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
1456       setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
1457     }
1458   }
1459 
1460   // FP16 often need to be promoted to call lib functions
1461   if (Subtarget->hasFullFP16()) {
1462     setOperationAction(ISD::FREM, MVT::f16, Promote);
1463     setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
1464     setOperationAction(ISD::FSIN, MVT::f16, Promote);
1465     setOperationAction(ISD::FCOS, MVT::f16, Promote);
1466     setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
1467     setOperationAction(ISD::FPOWI, MVT::f16, Promote);
1468     setOperationAction(ISD::FPOW, MVT::f16, Promote);
1469     setOperationAction(ISD::FEXP, MVT::f16, Promote);
1470     setOperationAction(ISD::FEXP2, MVT::f16, Promote);
1471     setOperationAction(ISD::FLOG, MVT::f16, Promote);
1472     setOperationAction(ISD::FLOG10, MVT::f16, Promote);
1473     setOperationAction(ISD::FLOG2, MVT::f16, Promote);
1474 
1475     setOperationAction(ISD::FROUND, MVT::f16, Legal);
1476   }
1477 
1478   if (Subtarget->hasNEON()) {
1479     // vmin and vmax aren't available in a scalar form, so we can use
1480     // a NEON instruction with an undef lane instead.  This has a performance
1481     // penalty on some cores, so we don't do this unless we have been
1482     // asked to by the core tuning model.
1483     if (Subtarget->useNEONForSinglePrecisionFP()) {
1484       setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
1485       setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
1486       setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
1487       setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
1488     }
1489     setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
1490     setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
1491     setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
1492     setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
1493 
1494     if (Subtarget->hasFullFP16()) {
1495       setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
1496       setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
1497       setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
1498       setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
1499 
1500       setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
1501       setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
1502       setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
1503       setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
1504     }
1505   }
1506 
1507   // We have target-specific dag combine patterns for the following nodes:
1508   // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
1509   setTargetDAGCombine(ISD::ADD);
1510   setTargetDAGCombine(ISD::SUB);
1511   setTargetDAGCombine(ISD::MUL);
1512   setTargetDAGCombine(ISD::AND);
1513   setTargetDAGCombine(ISD::OR);
1514   setTargetDAGCombine(ISD::XOR);
1515 
1516   if (Subtarget->hasMVEIntegerOps())
1517     setTargetDAGCombine(ISD::VSELECT);
1518 
1519   if (Subtarget->hasV6Ops())
1520     setTargetDAGCombine(ISD::SRL);
1521   if (Subtarget->isThumb1Only())
1522     setTargetDAGCombine(ISD::SHL);
1523 
1524   setStackPointerRegisterToSaveRestore(ARM::SP);
1525 
1526   if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1527       !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1528     setSchedulingPreference(Sched::RegPressure);
1529   else
1530     setSchedulingPreference(Sched::Hybrid);
1531 
1532   //// temporary - rewrite interface to use type
1533   MaxStoresPerMemset = 8;
1534   MaxStoresPerMemsetOptSize = 4;
1535   MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1536   MaxStoresPerMemcpyOptSize = 2;
1537   MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1538   MaxStoresPerMemmoveOptSize = 2;
1539 
1540   // On ARM arguments smaller than 4 bytes are extended, so all arguments
1541   // are at least 4 bytes aligned.
1542   setMinStackArgumentAlignment(Align(4));
1543 
1544   // Prefer likely predicted branches to selects on out-of-order cores.
1545   PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1546 
1547   setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
1548 
1549   setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1550 
1551   if (Subtarget->isThumb() || Subtarget->isThumb2())
1552     setTargetDAGCombine(ISD::ABS);
1553 }
1554 
1555 bool ARMTargetLowering::useSoftFloat() const {
1556   return Subtarget->useSoftFloat();
1557 }
1558 
1559 // FIXME: It might make sense to define the representative register class as the
1560 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1561 // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
1562 // SPR's representative would be DPR_VFP2. This should work well if register
1563 // pressure tracking were modified such that a register use would increment the
1564 // pressure of the register class's representative and all of it's super
1565 // classes' representatives transitively. We have not implemented this because
1566 // of the difficulty prior to coalescing of modeling operand register classes
1567 // due to the common occurrence of cross class copies and subregister insertions
1568 // and extractions.
1569 std::pair<const TargetRegisterClass *, uint8_t>
1570 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1571                                            MVT VT) const {
1572   const TargetRegisterClass *RRC = nullptr;
1573   uint8_t Cost = 1;
1574   switch (VT.SimpleTy) {
1575   default:
1576     return TargetLowering::findRepresentativeClass(TRI, VT);
1577   // Use DPR as representative register class for all floating point
1578   // and vector types. Since there are 32 SPR registers and 32 DPR registers so
1579   // the cost is 1 for both f32 and f64.
1580   case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1581   case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1582     RRC = &ARM::DPRRegClass;
1583     // When NEON is used for SP, only half of the register file is available
1584     // because operations that define both SP and DP results will be constrained
1585     // to the VFP2 class (D0-D15). We currently model this constraint prior to
1586     // coalescing by double-counting the SP regs. See the FIXME above.
1587     if (Subtarget->useNEONForSinglePrecisionFP())
1588       Cost = 2;
1589     break;
1590   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1591   case MVT::v4f32: case MVT::v2f64:
1592     RRC = &ARM::DPRRegClass;
1593     Cost = 2;
1594     break;
1595   case MVT::v4i64:
1596     RRC = &ARM::DPRRegClass;
1597     Cost = 4;
1598     break;
1599   case MVT::v8i64:
1600     RRC = &ARM::DPRRegClass;
1601     Cost = 8;
1602     break;
1603   }
1604   return std::make_pair(RRC, Cost);
1605 }
1606 
1607 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1608   switch ((ARMISD::NodeType)Opcode) {
1609   case ARMISD::FIRST_NUMBER:  break;
1610   case ARMISD::Wrapper:       return "ARMISD::Wrapper";
1611   case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
1612   case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
1613   case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
1614   case ARMISD::CALL:          return "ARMISD::CALL";
1615   case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
1616   case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
1617   case ARMISD::tSECALL:       return "ARMISD::tSECALL";
1618   case ARMISD::BRCOND:        return "ARMISD::BRCOND";
1619   case ARMISD::BR_JT:         return "ARMISD::BR_JT";
1620   case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
1621   case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
1622   case ARMISD::SERET_FLAG:    return "ARMISD::SERET_FLAG";
1623   case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
1624   case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
1625   case ARMISD::CMP:           return "ARMISD::CMP";
1626   case ARMISD::CMN:           return "ARMISD::CMN";
1627   case ARMISD::CMPZ:          return "ARMISD::CMPZ";
1628   case ARMISD::CMPFP:         return "ARMISD::CMPFP";
1629   case ARMISD::CMPFPE:        return "ARMISD::CMPFPE";
1630   case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
1631   case ARMISD::CMPFPEw0:      return "ARMISD::CMPFPEw0";
1632   case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
1633   case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
1634 
1635   case ARMISD::CMOV:          return "ARMISD::CMOV";
1636   case ARMISD::SUBS:          return "ARMISD::SUBS";
1637 
1638   case ARMISD::SSAT:          return "ARMISD::SSAT";
1639   case ARMISD::USAT:          return "ARMISD::USAT";
1640 
1641   case ARMISD::ASRL:          return "ARMISD::ASRL";
1642   case ARMISD::LSRL:          return "ARMISD::LSRL";
1643   case ARMISD::LSLL:          return "ARMISD::LSLL";
1644 
1645   case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
1646   case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
1647   case ARMISD::RRX:           return "ARMISD::RRX";
1648 
1649   case ARMISD::ADDC:          return "ARMISD::ADDC";
1650   case ARMISD::ADDE:          return "ARMISD::ADDE";
1651   case ARMISD::SUBC:          return "ARMISD::SUBC";
1652   case ARMISD::SUBE:          return "ARMISD::SUBE";
1653   case ARMISD::LSLS:          return "ARMISD::LSLS";
1654 
1655   case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
1656   case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
1657   case ARMISD::VMOVhr:        return "ARMISD::VMOVhr";
1658   case ARMISD::VMOVrh:        return "ARMISD::VMOVrh";
1659   case ARMISD::VMOVSR:        return "ARMISD::VMOVSR";
1660 
1661   case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
1662   case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
1663   case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";
1664 
1665   case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
1666 
1667   case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
1668 
1669   case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
1670 
1671   case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
1672 
1673   case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
1674 
1675   case ARMISD::LDRD:          return "ARMISD::LDRD";
1676   case ARMISD::STRD:          return "ARMISD::STRD";
1677 
1678   case ARMISD::WIN__CHKSTK:   return "ARMISD::WIN__CHKSTK";
1679   case ARMISD::WIN__DBZCHK:   return "ARMISD::WIN__DBZCHK";
1680 
1681   case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST";
1682   case ARMISD::VECTOR_REG_CAST: return "ARMISD::VECTOR_REG_CAST";
1683   case ARMISD::VCMP:          return "ARMISD::VCMP";
1684   case ARMISD::VCMPZ:         return "ARMISD::VCMPZ";
1685   case ARMISD::VTST:          return "ARMISD::VTST";
1686 
1687   case ARMISD::VSHLs:         return "ARMISD::VSHLs";
1688   case ARMISD::VSHLu:         return "ARMISD::VSHLu";
1689   case ARMISD::VSHLIMM:       return "ARMISD::VSHLIMM";
1690   case ARMISD::VSHRsIMM:      return "ARMISD::VSHRsIMM";
1691   case ARMISD::VSHRuIMM:      return "ARMISD::VSHRuIMM";
1692   case ARMISD::VRSHRsIMM:     return "ARMISD::VRSHRsIMM";
1693   case ARMISD::VRSHRuIMM:     return "ARMISD::VRSHRuIMM";
1694   case ARMISD::VRSHRNIMM:     return "ARMISD::VRSHRNIMM";
1695   case ARMISD::VQSHLsIMM:     return "ARMISD::VQSHLsIMM";
1696   case ARMISD::VQSHLuIMM:     return "ARMISD::VQSHLuIMM";
1697   case ARMISD::VQSHLsuIMM:    return "ARMISD::VQSHLsuIMM";
1698   case ARMISD::VQSHRNsIMM:    return "ARMISD::VQSHRNsIMM";
1699   case ARMISD::VQSHRNuIMM:    return "ARMISD::VQSHRNuIMM";
1700   case ARMISD::VQSHRNsuIMM:   return "ARMISD::VQSHRNsuIMM";
1701   case ARMISD::VQRSHRNsIMM:   return "ARMISD::VQRSHRNsIMM";
1702   case ARMISD::VQRSHRNuIMM:   return "ARMISD::VQRSHRNuIMM";
1703   case ARMISD::VQRSHRNsuIMM:  return "ARMISD::VQRSHRNsuIMM";
1704   case ARMISD::VSLIIMM:       return "ARMISD::VSLIIMM";
1705   case ARMISD::VSRIIMM:       return "ARMISD::VSRIIMM";
1706   case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
1707   case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
1708   case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
1709   case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
1710   case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
1711   case ARMISD::VDUP:          return "ARMISD::VDUP";
1712   case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
1713   case ARMISD::VEXT:          return "ARMISD::VEXT";
1714   case ARMISD::VREV64:        return "ARMISD::VREV64";
1715   case ARMISD::VREV32:        return "ARMISD::VREV32";
1716   case ARMISD::VREV16:        return "ARMISD::VREV16";
1717   case ARMISD::VZIP:          return "ARMISD::VZIP";
1718   case ARMISD::VUZP:          return "ARMISD::VUZP";
1719   case ARMISD::VTRN:          return "ARMISD::VTRN";
1720   case ARMISD::VTBL1:         return "ARMISD::VTBL1";
1721   case ARMISD::VTBL2:         return "ARMISD::VTBL2";
1722   case ARMISD::VMOVN:         return "ARMISD::VMOVN";
1723   case ARMISD::VQMOVNs:       return "ARMISD::VQMOVNs";
1724   case ARMISD::VQMOVNu:       return "ARMISD::VQMOVNu";
1725   case ARMISD::VCVTN:         return "ARMISD::VCVTN";
1726   case ARMISD::VCVTL:         return "ARMISD::VCVTL";
1727   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
1728   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
1729   case ARMISD::VQDMULH:       return "ARMISD::VQDMULH";
1730   case ARMISD::VADDVs:        return "ARMISD::VADDVs";
1731   case ARMISD::VADDVu:        return "ARMISD::VADDVu";
1732   case ARMISD::VADDVps:       return "ARMISD::VADDVps";
1733   case ARMISD::VADDVpu:       return "ARMISD::VADDVpu";
1734   case ARMISD::VADDLVs:       return "ARMISD::VADDLVs";
1735   case ARMISD::VADDLVu:       return "ARMISD::VADDLVu";
1736   case ARMISD::VADDLVAs:      return "ARMISD::VADDLVAs";
1737   case ARMISD::VADDLVAu:      return "ARMISD::VADDLVAu";
1738   case ARMISD::VADDLVps:      return "ARMISD::VADDLVps";
1739   case ARMISD::VADDLVpu:      return "ARMISD::VADDLVpu";
1740   case ARMISD::VADDLVAps:     return "ARMISD::VADDLVAps";
1741   case ARMISD::VADDLVApu:     return "ARMISD::VADDLVApu";
1742   case ARMISD::VMLAVs:        return "ARMISD::VMLAVs";
1743   case ARMISD::VMLAVu:        return "ARMISD::VMLAVu";
1744   case ARMISD::VMLAVps:       return "ARMISD::VMLAVps";
1745   case ARMISD::VMLAVpu:       return "ARMISD::VMLAVpu";
1746   case ARMISD::VMLALVs:       return "ARMISD::VMLALVs";
1747   case ARMISD::VMLALVu:       return "ARMISD::VMLALVu";
1748   case ARMISD::VMLALVps:      return "ARMISD::VMLALVps";
1749   case ARMISD::VMLALVpu:      return "ARMISD::VMLALVpu";
1750   case ARMISD::VMLALVAs:      return "ARMISD::VMLALVAs";
1751   case ARMISD::VMLALVAu:      return "ARMISD::VMLALVAu";
1752   case ARMISD::VMLALVAps:     return "ARMISD::VMLALVAps";
1753   case ARMISD::VMLALVApu:     return "ARMISD::VMLALVApu";
1754   case ARMISD::VMINVu:        return "ARMISD::VMINVu";
1755   case ARMISD::VMINVs:        return "ARMISD::VMINVs";
1756   case ARMISD::VMAXVu:        return "ARMISD::VMAXVu";
1757   case ARMISD::VMAXVs:        return "ARMISD::VMAXVs";
1758   case ARMISD::UMAAL:         return "ARMISD::UMAAL";
1759   case ARMISD::UMLAL:         return "ARMISD::UMLAL";
1760   case ARMISD::SMLAL:         return "ARMISD::SMLAL";
1761   case ARMISD::SMLALBB:       return "ARMISD::SMLALBB";
1762   case ARMISD::SMLALBT:       return "ARMISD::SMLALBT";
1763   case ARMISD::SMLALTB:       return "ARMISD::SMLALTB";
1764   case ARMISD::SMLALTT:       return "ARMISD::SMLALTT";
1765   case ARMISD::SMULWB:        return "ARMISD::SMULWB";
1766   case ARMISD::SMULWT:        return "ARMISD::SMULWT";
1767   case ARMISD::SMLALD:        return "ARMISD::SMLALD";
1768   case ARMISD::SMLALDX:       return "ARMISD::SMLALDX";
1769   case ARMISD::SMLSLD:        return "ARMISD::SMLSLD";
1770   case ARMISD::SMLSLDX:       return "ARMISD::SMLSLDX";
1771   case ARMISD::SMMLAR:        return "ARMISD::SMMLAR";
1772   case ARMISD::SMMLSR:        return "ARMISD::SMMLSR";
1773   case ARMISD::QADD16b:       return "ARMISD::QADD16b";
1774   case ARMISD::QSUB16b:       return "ARMISD::QSUB16b";
1775   case ARMISD::QADD8b:        return "ARMISD::QADD8b";
1776   case ARMISD::QSUB8b:        return "ARMISD::QSUB8b";
1777   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
1778   case ARMISD::BFI:           return "ARMISD::BFI";
1779   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
1780   case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
1781   case ARMISD::VBSP:          return "ARMISD::VBSP";
1782   case ARMISD::MEMCPY:        return "ARMISD::MEMCPY";
1783   case ARMISD::VLD1DUP:       return "ARMISD::VLD1DUP";
1784   case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
1785   case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
1786   case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
1787   case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
1788   case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
1789   case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
1790   case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
1791   case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
1792   case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
1793   case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
1794   case ARMISD::VLD1DUP_UPD:   return "ARMISD::VLD1DUP_UPD";
1795   case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
1796   case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
1797   case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
1798   case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
1799   case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
1800   case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
1801   case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
1802   case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
1803   case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
1804   case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
1805   case ARMISD::WLS:           return "ARMISD::WLS";
1806   case ARMISD::LE:            return "ARMISD::LE";
1807   case ARMISD::LOOP_DEC:      return "ARMISD::LOOP_DEC";
1808   case ARMISD::CSINV:         return "ARMISD::CSINV";
1809   case ARMISD::CSNEG:         return "ARMISD::CSNEG";
1810   case ARMISD::CSINC:         return "ARMISD::CSINC";
1811   }
1812   return nullptr;
1813 }
1814 
1815 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1816                                           EVT VT) const {
1817   if (!VT.isVector())
1818     return getPointerTy(DL);
1819 
1820   // MVE has a predicate register.
1821   if (Subtarget->hasMVEIntegerOps() &&
1822       (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8))
1823     return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1824   return VT.changeVectorElementTypeToInteger();
1825 }
1826 
1827 /// getRegClassFor - Return the register class that should be used for the
1828 /// specified value type.
1829 const TargetRegisterClass *
1830 ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1831   (void)isDivergent;
1832   // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1833   // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1834   // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1835   // MVE Q registers.
1836   if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
1837     if (VT == MVT::v4i64)
1838       return &ARM::QQPRRegClass;
1839     if (VT == MVT::v8i64)
1840       return &ARM::QQQQPRRegClass;
1841   }
1842   return TargetLowering::getRegClassFor(VT);
1843 }
1844 
1845 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
1846 // source/dest is aligned and the copy size is large enough. We therefore want
1847 // to align such objects passed to memory intrinsics.
1848 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
1849                                                unsigned &PrefAlign) const {
1850   if (!isa<MemIntrinsic>(CI))
1851     return false;
1852   MinSize = 8;
1853   // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1854   // cycle faster than 4-byte aligned LDM.
1855   PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
1856   return true;
1857 }
1858 
1859 // Create a fast isel object.
1860 FastISel *
1861 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1862                                   const TargetLibraryInfo *libInfo) const {
1863   return ARM::createFastISel(funcInfo, libInfo);
1864 }
1865 
1866 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1867   unsigned NumVals = N->getNumValues();
1868   if (!NumVals)
1869     return Sched::RegPressure;
1870 
1871   for (unsigned i = 0; i != NumVals; ++i) {
1872     EVT VT = N->getValueType(i);
1873     if (VT == MVT::Glue || VT == MVT::Other)
1874       continue;
1875     if (VT.isFloatingPoint() || VT.isVector())
1876       return Sched::ILP;
1877   }
1878 
1879   if (!N->isMachineOpcode())
1880     return Sched::RegPressure;
1881 
1882   // Load are scheduled for latency even if there instruction itinerary
1883   // is not available.
1884   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1885   const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1886 
1887   if (MCID.getNumDefs() == 0)
1888     return Sched::RegPressure;
1889   if (!Itins->isEmpty() &&
1890       Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1891     return Sched::ILP;
1892 
1893   return Sched::RegPressure;
1894 }
1895 
1896 //===----------------------------------------------------------------------===//
1897 // Lowering Code
1898 //===----------------------------------------------------------------------===//
1899 
1900 static bool isSRL16(const SDValue &Op) {
1901   if (Op.getOpcode() != ISD::SRL)
1902     return false;
1903   if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1904     return Const->getZExtValue() == 16;
1905   return false;
1906 }
1907 
1908 static bool isSRA16(const SDValue &Op) {
1909   if (Op.getOpcode() != ISD::SRA)
1910     return false;
1911   if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1912     return Const->getZExtValue() == 16;
1913   return false;
1914 }
1915 
1916 static bool isSHL16(const SDValue &Op) {
1917   if (Op.getOpcode() != ISD::SHL)
1918     return false;
1919   if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1920     return Const->getZExtValue() == 16;
1921   return false;
1922 }
1923 
1924 // Check for a signed 16-bit value. We special case SRA because it makes it
1925 // more simple when also looking for SRAs that aren't sign extending a
1926 // smaller value. Without the check, we'd need to take extra care with
1927 // checking order for some operations.
1928 static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1929   if (isSRA16(Op))
1930     return isSHL16(Op.getOperand(0));
1931   return DAG.ComputeNumSignBits(Op) == 17;
1932 }
1933 
1934 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1935 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
1936   switch (CC) {
1937   default: llvm_unreachable("Unknown condition code!");
1938   case ISD::SETNE:  return ARMCC::NE;
1939   case ISD::SETEQ:  return ARMCC::EQ;
1940   case ISD::SETGT:  return ARMCC::GT;
1941   case ISD::SETGE:  return ARMCC::GE;
1942   case ISD::SETLT:  return ARMCC::LT;
1943   case ISD::SETLE:  return ARMCC::LE;
1944   case ISD::SETUGT: return ARMCC::HI;
1945   case ISD::SETUGE: return ARMCC::HS;
1946   case ISD::SETULT: return ARMCC::LO;
1947   case ISD::SETULE: return ARMCC::LS;
1948   }
1949 }
1950 
1951 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1952 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
1953                         ARMCC::CondCodes &CondCode2) {
1954   CondCode2 = ARMCC::AL;
1955   switch (CC) {
1956   default: llvm_unreachable("Unknown FP condition!");
1957   case ISD::SETEQ:
1958   case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1959   case ISD::SETGT:
1960   case ISD::SETOGT: CondCode = ARMCC::GT; break;
1961   case ISD::SETGE:
1962   case ISD::SETOGE: CondCode = ARMCC::GE; break;
1963   case ISD::SETOLT: CondCode = ARMCC::MI; break;
1964   case ISD::SETOLE: CondCode = ARMCC::LS; break;
1965   case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1966   case ISD::SETO:   CondCode = ARMCC::VC; break;
1967   case ISD::SETUO:  CondCode = ARMCC::VS; break;
1968   case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1969   case ISD::SETUGT: CondCode = ARMCC::HI; break;
1970   case ISD::SETUGE: CondCode = ARMCC::PL; break;
1971   case ISD::SETLT:
1972   case ISD::SETULT: CondCode = ARMCC::LT; break;
1973   case ISD::SETLE:
1974   case ISD::SETULE: CondCode = ARMCC::LE; break;
1975   case ISD::SETNE:
1976   case ISD::SETUNE: CondCode = ARMCC::NE; break;
1977   }
1978 }
1979 
1980 //===----------------------------------------------------------------------===//
1981 //                      Calling Convention Implementation
1982 //===----------------------------------------------------------------------===//
1983 
1984 /// getEffectiveCallingConv - Get the effective calling convention, taking into
1985 /// account presence of floating point hardware and calling convention
1986 /// limitations, such as support for variadic functions.
1987 CallingConv::ID
1988 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1989                                            bool isVarArg) const {
1990   switch (CC) {
1991   default:
1992     report_fatal_error("Unsupported calling convention");
1993   case CallingConv::ARM_AAPCS:
1994   case CallingConv::ARM_APCS:
1995   case CallingConv::GHC:
1996   case CallingConv::CFGuard_Check:
1997     return CC;
1998   case CallingConv::PreserveMost:
1999     return CallingConv::PreserveMost;
2000   case CallingConv::ARM_AAPCS_VFP:
2001   case CallingConv::Swift:
2002     return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
2003   case CallingConv::C:
2004     if (!Subtarget->isAAPCS_ABI())
2005       return CallingConv::ARM_APCS;
2006     else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
2007              getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
2008              !isVarArg)
2009       return CallingConv::ARM_AAPCS_VFP;
2010     else
2011       return CallingConv::ARM_AAPCS;
2012   case CallingConv::Fast:
2013   case CallingConv::CXX_FAST_TLS:
2014     if (!Subtarget->isAAPCS_ABI()) {
2015       if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
2016         return CallingConv::Fast;
2017       return CallingConv::ARM_APCS;
2018     } else if (Subtarget->hasVFP2Base() &&
2019                !Subtarget->isThumb1Only() && !isVarArg)
2020       return CallingConv::ARM_AAPCS_VFP;
2021     else
2022       return CallingConv::ARM_AAPCS;
2023   }
2024 }
2025 
2026 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
2027                                                  bool isVarArg) const {
2028   return CCAssignFnForNode(CC, false, isVarArg);
2029 }
2030 
2031 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
2032                                                    bool isVarArg) const {
2033   return CCAssignFnForNode(CC, true, isVarArg);
2034 }
2035 
2036 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given
2037 /// CallingConvention.
2038 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
2039                                                  bool Return,
2040                                                  bool isVarArg) const {
2041   switch (getEffectiveCallingConv(CC, isVarArg)) {
2042   default:
2043     report_fatal_error("Unsupported calling convention");
2044   case CallingConv::ARM_APCS:
2045     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
2046   case CallingConv::ARM_AAPCS:
2047     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2048   case CallingConv::ARM_AAPCS_VFP:
2049     return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
2050   case CallingConv::Fast:
2051     return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
2052   case CallingConv::GHC:
2053     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
2054   case CallingConv::PreserveMost:
2055     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2056   case CallingConv::CFGuard_Check:
2057     return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
2058   }
2059 }
2060 
2061 SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
2062                                      MVT LocVT, MVT ValVT, SDValue Val) const {
2063   Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
2064                     Val);
2065   if (Subtarget->hasFullFP16()) {
2066     Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
2067   } else {
2068     Val = DAG.getNode(ISD::TRUNCATE, dl,
2069                       MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2070     Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
2071   }
2072   return Val;
2073 }
2074 
2075 SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
2076                                        MVT LocVT, MVT ValVT,
2077                                        SDValue Val) const {
2078   if (Subtarget->hasFullFP16()) {
2079     Val = DAG.getNode(ARMISD::VMOVrh, dl,
2080                       MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2081   } else {
2082     Val = DAG.getNode(ISD::BITCAST, dl,
2083                       MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2084     Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
2085                       MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2086   }
2087   return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
2088 }
2089 
2090 /// LowerCallResult - Lower the result values of a call into the
2091 /// appropriate copies out of appropriate physical registers.
2092 SDValue ARMTargetLowering::LowerCallResult(
2093     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2094     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2095     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
2096     SDValue ThisVal) const {
2097   // Assign locations to each value returned by this call.
2098   SmallVector<CCValAssign, 16> RVLocs;
2099   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2100                  *DAG.getContext());
2101   CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
2102 
2103   // Copy all of the result registers out of their specified physreg.
2104   for (unsigned i = 0; i != RVLocs.size(); ++i) {
2105     CCValAssign VA = RVLocs[i];
2106 
2107     // Pass 'this' value directly from the argument to return value, to avoid
2108     // reg unit interference
2109     if (i == 0 && isThisReturn) {
2110       assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
2111              "unexpected return calling convention register assignment");
2112       InVals.push_back(ThisVal);
2113       continue;
2114     }
2115 
2116     SDValue Val;
2117     if (VA.needsCustom() &&
2118         (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
2119       // Handle f64 or half of a v2f64.
2120       SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2121                                       InFlag);
2122       Chain = Lo.getValue(1);
2123       InFlag = Lo.getValue(2);
2124       VA = RVLocs[++i]; // skip ahead to next loc
2125       SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2126                                       InFlag);
2127       Chain = Hi.getValue(1);
2128       InFlag = Hi.getValue(2);
2129       if (!Subtarget->isLittle())
2130         std::swap (Lo, Hi);
2131       Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2132 
2133       if (VA.getLocVT() == MVT::v2f64) {
2134         SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2135         Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2136                           DAG.getConstant(0, dl, MVT::i32));
2137 
2138         VA = RVLocs[++i]; // skip ahead to next loc
2139         Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
2140         Chain = Lo.getValue(1);
2141         InFlag = Lo.getValue(2);
2142         VA = RVLocs[++i]; // skip ahead to next loc
2143         Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
2144         Chain = Hi.getValue(1);
2145         InFlag = Hi.getValue(2);
2146         if (!Subtarget->isLittle())
2147           std::swap (Lo, Hi);
2148         Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2149         Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2150                           DAG.getConstant(1, dl, MVT::i32));
2151       }
2152     } else {
2153       Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
2154                                InFlag);
2155       Chain = Val.getValue(1);
2156       InFlag = Val.getValue(2);
2157     }
2158 
2159     switch (VA.getLocInfo()) {
2160     default: llvm_unreachable("Unknown loc info!");
2161     case CCValAssign::Full: break;
2162     case CCValAssign::BCvt:
2163       Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
2164       break;
2165     }
2166 
2167     // f16 arguments have their size extended to 4 bytes and passed as if they
2168     // had been copied to the LSBs of a 32-bit register.
2169     // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2170     if (VA.needsCustom() &&
2171         (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
2172       Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
2173 
2174     InVals.push_back(Val);
2175   }
2176 
2177   return Chain;
2178 }
2179 
2180 /// LowerMemOpCallTo - Store the argument to the stack.
2181 SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
2182                                             SDValue Arg, const SDLoc &dl,
2183                                             SelectionDAG &DAG,
2184                                             const CCValAssign &VA,
2185                                             ISD::ArgFlagsTy Flags) const {
2186   unsigned LocMemOffset = VA.getLocMemOffset();
2187   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
2188   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2189                        StackPtr, PtrOff);
2190   return DAG.getStore(
2191       Chain, dl, Arg, PtrOff,
2192       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
2193 }
2194 
2195 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2196                                          SDValue Chain, SDValue &Arg,
2197                                          RegsToPassVector &RegsToPass,
2198                                          CCValAssign &VA, CCValAssign &NextVA,
2199                                          SDValue &StackPtr,
2200                                          SmallVectorImpl<SDValue> &MemOpChains,
2201                                          ISD::ArgFlagsTy Flags) const {
2202   SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2203                               DAG.getVTList(MVT::i32, MVT::i32), Arg);
2204   unsigned id = Subtarget->isLittle() ? 0 : 1;
2205   RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2206 
2207   if (NextVA.isRegLoc())
2208     RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2209   else {
2210     assert(NextVA.isMemLoc());
2211     if (!StackPtr.getNode())
2212       StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2213                                     getPointerTy(DAG.getDataLayout()));
2214 
2215     MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
2216                                            dl, DAG, NextVA,
2217                                            Flags));
2218   }
2219 }
2220 
2221 /// LowerCall - Lowering a call into a callseq_start <-
2222 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
2223 /// nodes.
2224 SDValue
2225 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2226                              SmallVectorImpl<SDValue> &InVals) const {
2227   SelectionDAG &DAG                     = CLI.DAG;
2228   SDLoc &dl                             = CLI.DL;
2229   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2230   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2231   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2232   SDValue Chain                         = CLI.Chain;
2233   SDValue Callee                        = CLI.Callee;
2234   bool &isTailCall                      = CLI.IsTailCall;
2235   CallingConv::ID CallConv              = CLI.CallConv;
2236   bool doesNotRet                       = CLI.DoesNotReturn;
2237   bool isVarArg                         = CLI.IsVarArg;
2238 
2239   MachineFunction &MF = DAG.getMachineFunction();
2240   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2241   MachineFunction::CallSiteInfo CSInfo;
2242   bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2243   bool isThisReturn = false;
2244   bool isCmseNSCall   = false;
2245   bool PreferIndirect = false;
2246 
2247   // Determine whether this is a non-secure function call.
2248   if (CLI.CB && CLI.CB->getAttributes().hasFnAttribute("cmse_nonsecure_call"))
2249     isCmseNSCall = true;
2250 
2251   // Disable tail calls if they're not supported.
2252   if (!Subtarget->supportsTailCall())
2253     isTailCall = false;
2254 
2255   // For both the non-secure calls and the returns from a CMSE entry function,
2256   // the function needs to do some extra work afte r the call, or before the
2257   // return, respectively, thus it cannot end with atail call
2258   if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2259     isTailCall = false;
2260 
2261   if (isa<GlobalAddressSDNode>(Callee)) {
2262     // If we're optimizing for minimum size and the function is called three or
2263     // more times in this block, we can improve codesize by calling indirectly
2264     // as BLXr has a 16-bit encoding.
2265     auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2266     if (CLI.CB) {
2267       auto *BB = CLI.CB->getParent();
2268       PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2269                        count_if(GV->users(), [&BB](const User *U) {
2270                          return isa<Instruction>(U) &&
2271                                 cast<Instruction>(U)->getParent() == BB;
2272                        }) > 2;
2273     }
2274   }
2275   if (isTailCall) {
2276     // Check if it's really possible to do a tail call.
2277     isTailCall = IsEligibleForTailCallOptimization(
2278         Callee, CallConv, isVarArg, isStructRet,
2279         MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
2280         PreferIndirect);
2281     if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2282       report_fatal_error("failed to perform tail call elimination on a call "
2283                          "site marked musttail");
2284     // We don't support GuaranteedTailCallOpt for ARM, only automatically
2285     // detected sibcalls.
2286     if (isTailCall)
2287       ++NumTailCalls;
2288   }
2289 
2290   // Analyze operands of the call, assigning locations to each operand.
2291   SmallVector<CCValAssign, 16> ArgLocs;
2292   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2293                  *DAG.getContext());
2294   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2295 
2296   // Get a count of how many bytes are to be pushed on the stack.
2297   unsigned NumBytes = CCInfo.getNextStackOffset();
2298 
2299   if (isTailCall) {
2300     // For tail calls, memory operands are available in our caller's stack.
2301     NumBytes = 0;
2302   } else {
2303     // Adjust the stack pointer for the new arguments...
2304     // These operations are automatically eliminated by the prolog/epilog pass
2305     Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
2306   }
2307 
2308   SDValue StackPtr =
2309       DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2310 
2311   RegsToPassVector RegsToPass;
2312   SmallVector<SDValue, 8> MemOpChains;
2313 
2314   // Walk the register/memloc assignments, inserting copies/loads.  In the case
2315   // of tail call optimization, arguments are handled later.
2316   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2317        i != e;
2318        ++i, ++realArgIdx) {
2319     CCValAssign &VA = ArgLocs[i];
2320     SDValue Arg = OutVals[realArgIdx];
2321     ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2322     bool isByVal = Flags.isByVal();
2323 
2324     // Promote the value if needed.
2325     switch (VA.getLocInfo()) {
2326     default: llvm_unreachable("Unknown loc info!");
2327     case CCValAssign::Full: break;
2328     case CCValAssign::SExt:
2329       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2330       break;
2331     case CCValAssign::ZExt:
2332       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2333       break;
2334     case CCValAssign::AExt:
2335       Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2336       break;
2337     case CCValAssign::BCvt:
2338       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2339       break;
2340     }
2341 
2342     // f16 arguments have their size extended to 4 bytes and passed as if they
2343     // had been copied to the LSBs of a 32-bit register.
2344     // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2345     if (VA.needsCustom() &&
2346         (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2347       Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2348     } else {
2349       // f16 arguments could have been extended prior to argument lowering.
2350       // Mask them arguments if this is a CMSE nonsecure call.
2351       auto ArgVT = Outs[realArgIdx].ArgVT;
2352       if (isCmseNSCall && (ArgVT == MVT::f16)) {
2353         auto LocBits = VA.getLocVT().getSizeInBits();
2354         auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2355         SDValue Mask =
2356             DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2357         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2358         Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2359         Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2360       }
2361     }
2362 
2363     // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2364     if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2365       SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2366                                 DAG.getConstant(0, dl, MVT::i32));
2367       SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2368                                 DAG.getConstant(1, dl, MVT::i32));
2369 
2370       PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2371                        StackPtr, MemOpChains, Flags);
2372 
2373       VA = ArgLocs[++i]; // skip ahead to next loc
2374       if (VA.isRegLoc()) {
2375         PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2376                          StackPtr, MemOpChains, Flags);
2377       } else {
2378         assert(VA.isMemLoc());
2379 
2380         MemOpChains.push_back(
2381             LowerMemOpCallTo(Chain, StackPtr, Op1, dl, DAG, VA, Flags));
2382       }
2383     } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2384       PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2385                        StackPtr, MemOpChains, Flags);
2386     } else if (VA.isRegLoc()) {
2387       if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2388           Outs[0].VT == MVT::i32) {
2389         assert(VA.getLocVT() == MVT::i32 &&
2390                "unexpected calling convention register assignment");
2391         assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2392                "unexpected use of 'returned'");
2393         isThisReturn = true;
2394       }
2395       const TargetOptions &Options = DAG.getTarget().Options;
2396       if (Options.EmitCallSiteInfo)
2397         CSInfo.emplace_back(VA.getLocReg(), i);
2398       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2399     } else if (isByVal) {
2400       assert(VA.isMemLoc());
2401       unsigned offset = 0;
2402 
2403       // True if this byval aggregate will be split between registers
2404       // and memory.
2405       unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2406       unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2407 
2408       if (CurByValIdx < ByValArgsCount) {
2409 
2410         unsigned RegBegin, RegEnd;
2411         CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2412 
2413         EVT PtrVT =
2414             DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
2415         unsigned int i, j;
2416         for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2417           SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2418           SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
2419           SDValue Load =
2420               DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2421                           DAG.InferPtrAlign(AddArg));
2422           MemOpChains.push_back(Load.getValue(1));
2423           RegsToPass.push_back(std::make_pair(j, Load));
2424         }
2425 
2426         // If parameter size outsides register area, "offset" value
2427         // helps us to calculate stack slot for remained part properly.
2428         offset = RegEnd - RegBegin;
2429 
2430         CCInfo.nextInRegsParam();
2431       }
2432 
2433       if (Flags.getByValSize() > 4*offset) {
2434         auto PtrVT = getPointerTy(DAG.getDataLayout());
2435         unsigned LocMemOffset = VA.getLocMemOffset();
2436         SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
2437         SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
2438         SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2439         SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
2440         SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2441                                            MVT::i32);
2442         SDValue AlignNode =
2443             DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2444 
2445         SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2446         SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2447         MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2448                                           Ops));
2449       }
2450     } else if (!isTailCall) {
2451       assert(VA.isMemLoc());
2452 
2453       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2454                                              dl, DAG, VA, Flags));
2455     }
2456   }
2457 
2458   if (!MemOpChains.empty())
2459     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2460 
2461   // Build a sequence of copy-to-reg nodes chained together with token chain
2462   // and flag operands which copy the outgoing args into the appropriate regs.
2463   SDValue InFlag;
2464   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2465     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2466                              RegsToPass[i].second, InFlag);
2467     InFlag = Chain.getValue(1);
2468   }
2469 
2470   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2471   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2472   // node so that legalize doesn't hack it.
2473   bool isDirect = false;
2474 
2475   const TargetMachine &TM = getTargetMachine();
2476   const Module *Mod = MF.getFunction().getParent();
2477   const GlobalValue *GV = nullptr;
2478   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2479     GV = G->getGlobal();
2480   bool isStub =
2481       !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
2482 
2483   bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2484   bool isLocalARMFunc = false;
2485   auto PtrVt = getPointerTy(DAG.getDataLayout());
2486 
2487   if (Subtarget->genLongCalls()) {
2488     assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2489            "long-calls codegen is not position independent!");
2490     // Handle a global address or an external symbol. If it's not one of
2491     // those, the target's already in a register, so we don't need to do
2492     // anything extra.
2493     if (isa<GlobalAddressSDNode>(Callee)) {
2494       // Create a constant pool entry for the callee address
2495       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2496       ARMConstantPoolValue *CPV =
2497         ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
2498 
2499       // Get the address of the callee into a register
2500       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2501       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2502       Callee = DAG.getLoad(
2503           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2504           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2505     } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2506       const char *Sym = S->getSymbol();
2507 
2508       // Create a constant pool entry for the callee address
2509       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2510       ARMConstantPoolValue *CPV =
2511         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2512                                       ARMPCLabelIndex, 0);
2513       // Get the address of the callee into a register
2514       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2515       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2516       Callee = DAG.getLoad(
2517           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2518           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2519     }
2520   } else if (isa<GlobalAddressSDNode>(Callee)) {
2521     if (!PreferIndirect) {
2522       isDirect = true;
2523       bool isDef = GV->isStrongDefinitionForLinker();
2524 
2525       // ARM call to a local ARM function is predicable.
2526       isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2527       // tBX takes a register source operand.
2528       if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2529         assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2530         Callee = DAG.getNode(
2531             ARMISD::WrapperPIC, dl, PtrVt,
2532             DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2533         Callee = DAG.getLoad(
2534             PtrVt, dl, DAG.getEntryNode(), Callee,
2535             MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(),
2536             MachineMemOperand::MODereferenceable |
2537                 MachineMemOperand::MOInvariant);
2538       } else if (Subtarget->isTargetCOFF()) {
2539         assert(Subtarget->isTargetWindows() &&
2540                "Windows is the only supported COFF target");
2541         unsigned TargetFlags = ARMII::MO_NO_FLAG;
2542         if (GV->hasDLLImportStorageClass())
2543           TargetFlags = ARMII::MO_DLLIMPORT;
2544         else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
2545           TargetFlags = ARMII::MO_COFFSTUB;
2546         Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0,
2547                                             TargetFlags);
2548         if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2549           Callee =
2550               DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2551                           DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2552                           MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2553       } else {
2554         Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
2555       }
2556     }
2557   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2558     isDirect = true;
2559     // tBX takes a register source operand.
2560     const char *Sym = S->getSymbol();
2561     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2562       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2563       ARMConstantPoolValue *CPV =
2564         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2565                                       ARMPCLabelIndex, 4);
2566       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2567       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2568       Callee = DAG.getLoad(
2569           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2570           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2571       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2572       Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2573     } else {
2574       Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2575     }
2576   }
2577 
2578   if (isCmseNSCall) {
2579     assert(!isARMFunc && !isDirect &&
2580            "Cannot handle call to ARM function or direct call");
2581     if (NumBytes > 0) {
2582       DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(),
2583                                      "call to non-secure function would "
2584                                      "require passing arguments on stack",
2585                                      dl.getDebugLoc());
2586       DAG.getContext()->diagnose(Diag);
2587     }
2588     if (isStructRet) {
2589       DiagnosticInfoUnsupported Diag(
2590           DAG.getMachineFunction().getFunction(),
2591           "call to non-secure function would return value through pointer",
2592           dl.getDebugLoc());
2593       DAG.getContext()->diagnose(Diag);
2594     }
2595   }
2596 
2597   // FIXME: handle tail calls differently.
2598   unsigned CallOpc;
2599   if (Subtarget->isThumb()) {
2600     if (isCmseNSCall)
2601       CallOpc = ARMISD::tSECALL;
2602     else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2603       CallOpc = ARMISD::CALL_NOLINK;
2604     else
2605       CallOpc = ARMISD::CALL;
2606   } else {
2607     if (!isDirect && !Subtarget->hasV5TOps())
2608       CallOpc = ARMISD::CALL_NOLINK;
2609     else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2610              // Emit regular call when code size is the priority
2611              !Subtarget->hasMinSize())
2612       // "mov lr, pc; b _foo" to avoid confusing the RSP
2613       CallOpc = ARMISD::CALL_NOLINK;
2614     else
2615       CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2616   }
2617 
2618   std::vector<SDValue> Ops;
2619   Ops.push_back(Chain);
2620   Ops.push_back(Callee);
2621 
2622   // Add argument registers to the end of the list so that they are known live
2623   // into the call.
2624   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2625     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2626                                   RegsToPass[i].second.getValueType()));
2627 
2628   // Add a register mask operand representing the call-preserved registers.
2629   if (!isTailCall) {
2630     const uint32_t *Mask;
2631     const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2632     if (isThisReturn) {
2633       // For 'this' returns, use the R0-preserving mask if applicable
2634       Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2635       if (!Mask) {
2636         // Set isThisReturn to false if the calling convention is not one that
2637         // allows 'returned' to be modeled in this way, so LowerCallResult does
2638         // not try to pass 'this' straight through
2639         isThisReturn = false;
2640         Mask = ARI->getCallPreservedMask(MF, CallConv);
2641       }
2642     } else
2643       Mask = ARI->getCallPreservedMask(MF, CallConv);
2644 
2645     assert(Mask && "Missing call preserved mask for calling convention");
2646     Ops.push_back(DAG.getRegisterMask(Mask));
2647   }
2648 
2649   if (InFlag.getNode())
2650     Ops.push_back(InFlag);
2651 
2652   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2653   if (isTailCall) {
2654     MF.getFrameInfo().setHasTailCall();
2655     SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
2656     DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2657     return Ret;
2658   }
2659 
2660   // Returns a chain and a flag for retval copy to use.
2661   Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
2662   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2663   InFlag = Chain.getValue(1);
2664   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2665 
2666   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
2667                              DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
2668   if (!Ins.empty())
2669     InFlag = Chain.getValue(1);
2670 
2671   // Handle result values, copying them out of physregs into vregs that we
2672   // return.
2673   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
2674                          InVals, isThisReturn,
2675                          isThisReturn ? OutVals[0] : SDValue());
2676 }
2677 
2678 /// HandleByVal - Every parameter *after* a byval parameter is passed
2679 /// on the stack.  Remember the next parameter register to allocate,
2680 /// and then confiscate the rest of the parameter registers to insure
2681 /// this.
2682 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2683                                     Align Alignment) const {
2684   // Byval (as with any stack) slots are always at least 4 byte aligned.
2685   Alignment = std::max(Alignment, Align(4));
2686 
2687   unsigned Reg = State->AllocateReg(GPRArgRegs);
2688   if (!Reg)
2689     return;
2690 
2691   unsigned AlignInRegs = Alignment.value() / 4;
2692   unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2693   for (unsigned i = 0; i < Waste; ++i)
2694     Reg = State->AllocateReg(GPRArgRegs);
2695 
2696   if (!Reg)
2697     return;
2698 
2699   unsigned Excess = 4 * (ARM::R4 - Reg);
2700 
2701   // Special case when NSAA != SP and parameter size greater than size of
2702   // all remained GPR regs. In that case we can't split parameter, we must
2703   // send it to stack. We also must set NCRN to R4, so waste all
2704   // remained registers.
2705   const unsigned NSAAOffset = State->getNextStackOffset();
2706   if (NSAAOffset != 0 && Size > Excess) {
2707     while (State->AllocateReg(GPRArgRegs))
2708       ;
2709     return;
2710   }
2711 
2712   // First register for byval parameter is the first register that wasn't
2713   // allocated before this method call, so it would be "reg".
2714   // If parameter is small enough to be saved in range [reg, r4), then
2715   // the end (first after last) register would be reg + param-size-in-regs,
2716   // else parameter would be splitted between registers and stack,
2717   // end register would be r4 in this case.
2718   unsigned ByValRegBegin = Reg;
2719   unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2720   State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2721   // Note, first register is allocated in the beginning of function already,
2722   // allocate remained amount of registers we need.
2723   for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2724     State->AllocateReg(GPRArgRegs);
2725   // A byval parameter that is split between registers and memory needs its
2726   // size truncated here.
2727   // In the case where the entire structure fits in registers, we set the
2728   // size in memory to zero.
2729   Size = std::max<int>(Size - Excess, 0);
2730 }
2731 
2732 /// MatchingStackOffset - Return true if the given stack call argument is
2733 /// already available in the same position (relatively) of the caller's
2734 /// incoming argument stack.
2735 static
2736 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2737                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2738                          const TargetInstrInfo *TII) {
2739   unsigned Bytes = Arg.getValueSizeInBits() / 8;
2740   int FI = std::numeric_limits<int>::max();
2741   if (Arg.getOpcode() == ISD::CopyFromReg) {
2742     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2743     if (!Register::isVirtualRegister(VR))
2744       return false;
2745     MachineInstr *Def = MRI->getVRegDef(VR);
2746     if (!Def)
2747       return false;
2748     if (!Flags.isByVal()) {
2749       if (!TII->isLoadFromStackSlot(*Def, FI))
2750         return false;
2751     } else {
2752       return false;
2753     }
2754   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2755     if (Flags.isByVal())
2756       // ByVal argument is passed in as a pointer but it's now being
2757       // dereferenced. e.g.
2758       // define @foo(%struct.X* %A) {
2759       //   tail call @bar(%struct.X* byval %A)
2760       // }
2761       return false;
2762     SDValue Ptr = Ld->getBasePtr();
2763     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2764     if (!FINode)
2765       return false;
2766     FI = FINode->getIndex();
2767   } else
2768     return false;
2769 
2770   assert(FI != std::numeric_limits<int>::max());
2771   if (!MFI.isFixedObjectIndex(FI))
2772     return false;
2773   return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
2774 }
2775 
2776 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
2777 /// for tail call optimization. Targets which want to do tail call
2778 /// optimization should implement this function.
2779 bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2780     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
2781     bool isCalleeStructRet, bool isCallerStructRet,
2782     const SmallVectorImpl<ISD::OutputArg> &Outs,
2783     const SmallVectorImpl<SDValue> &OutVals,
2784     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
2785     const bool isIndirect) const {
2786   MachineFunction &MF = DAG.getMachineFunction();
2787   const Function &CallerF = MF.getFunction();
2788   CallingConv::ID CallerCC = CallerF.getCallingConv();
2789 
2790   assert(Subtarget->supportsTailCall());
2791 
2792   // Indirect tail calls cannot be optimized for Thumb1 if the args
2793   // to the call take up r0-r3. The reason is that there are no legal registers
2794   // left to hold the pointer to the function to be called.
2795   if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
2796       (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect))
2797     return false;
2798 
2799   // Look for obvious safe cases to perform tail call optimization that do not
2800   // require ABI changes. This is what gcc calls sibcall.
2801 
2802   // Exception-handling functions need a special set of instructions to indicate
2803   // a return to the hardware. Tail-calling another function would probably
2804   // break this.
2805   if (CallerF.hasFnAttribute("interrupt"))
2806     return false;
2807 
2808   // Also avoid sibcall optimization if either caller or callee uses struct
2809   // return semantics.
2810   if (isCalleeStructRet || isCallerStructRet)
2811     return false;
2812 
2813   // Externally-defined functions with weak linkage should not be
2814   // tail-called on ARM when the OS does not support dynamic
2815   // pre-emption of symbols, as the AAELF spec requires normal calls
2816   // to undefined weak functions to be replaced with a NOP or jump to the
2817   // next instruction. The behaviour of branch instructions in this
2818   // situation (as used for tail calls) is implementation-defined, so we
2819   // cannot rely on the linker replacing the tail call with a return.
2820   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2821     const GlobalValue *GV = G->getGlobal();
2822     const Triple &TT = getTargetMachine().getTargetTriple();
2823     if (GV->hasExternalWeakLinkage() &&
2824         (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
2825       return false;
2826   }
2827 
2828   // Check that the call results are passed in the same way.
2829   LLVMContext &C = *DAG.getContext();
2830   if (!CCState::resultsCompatible(
2831           getEffectiveCallingConv(CalleeCC, isVarArg),
2832           getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
2833           CCAssignFnForReturn(CalleeCC, isVarArg),
2834           CCAssignFnForReturn(CallerCC, CallerF.isVarArg())))
2835     return false;
2836   // The callee has to preserve all registers the caller needs to preserve.
2837   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2838   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2839   if (CalleeCC != CallerCC) {
2840     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2841     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2842       return false;
2843   }
2844 
2845   // If Caller's vararg or byval argument has been split between registers and
2846   // stack, do not perform tail call, since part of the argument is in caller's
2847   // local frame.
2848   const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
2849   if (AFI_Caller->getArgRegsSaveSize())
2850     return false;
2851 
2852   // If the callee takes no arguments then go on to check the results of the
2853   // call.
2854   if (!Outs.empty()) {
2855     // Check if stack adjustment is needed. For now, do not do this if any
2856     // argument is passed on the stack.
2857     SmallVector<CCValAssign, 16> ArgLocs;
2858     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
2859     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
2860     if (CCInfo.getNextStackOffset()) {
2861       // Check if the arguments are already laid out in the right way as
2862       // the caller's fixed stack objects.
2863       MachineFrameInfo &MFI = MF.getFrameInfo();
2864       const MachineRegisterInfo *MRI = &MF.getRegInfo();
2865       const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2866       for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2867            i != e;
2868            ++i, ++realArgIdx) {
2869         CCValAssign &VA = ArgLocs[i];
2870         EVT RegVT = VA.getLocVT();
2871         SDValue Arg = OutVals[realArgIdx];
2872         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2873         if (VA.getLocInfo() == CCValAssign::Indirect)
2874           return false;
2875         if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
2876           // f64 and vector types are split into multiple registers or
2877           // register/stack-slot combinations.  The types will not match
2878           // the registers; give up on memory f64 refs until we figure
2879           // out what to do about this.
2880           if (!VA.isRegLoc())
2881             return false;
2882           if (!ArgLocs[++i].isRegLoc())
2883             return false;
2884           if (RegVT == MVT::v2f64) {
2885             if (!ArgLocs[++i].isRegLoc())
2886               return false;
2887             if (!ArgLocs[++i].isRegLoc())
2888               return false;
2889           }
2890         } else if (!VA.isRegLoc()) {
2891           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2892                                    MFI, MRI, TII))
2893             return false;
2894         }
2895       }
2896     }
2897 
2898     const MachineRegisterInfo &MRI = MF.getRegInfo();
2899     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
2900       return false;
2901   }
2902 
2903   return true;
2904 }
2905 
2906 bool
2907 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2908                                   MachineFunction &MF, bool isVarArg,
2909                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
2910                                   LLVMContext &Context) const {
2911   SmallVector<CCValAssign, 16> RVLocs;
2912   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2913   return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2914 }
2915 
2916 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
2917                                     const SDLoc &DL, SelectionDAG &DAG) {
2918   const MachineFunction &MF = DAG.getMachineFunction();
2919   const Function &F = MF.getFunction();
2920 
2921   StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
2922 
2923   // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
2924   // version of the "preferred return address". These offsets affect the return
2925   // instruction if this is a return from PL1 without hypervisor extensions.
2926   //    IRQ/FIQ: +4     "subs pc, lr, #4"
2927   //    SWI:     0      "subs pc, lr, #0"
2928   //    ABORT:   +4     "subs pc, lr, #4"
2929   //    UNDEF:   +4/+2  "subs pc, lr, #0"
2930   // UNDEF varies depending on where the exception came from ARM or Thumb
2931   // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
2932 
2933   int64_t LROffset;
2934   if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
2935       IntKind == "ABORT")
2936     LROffset = 4;
2937   else if (IntKind == "SWI" || IntKind == "UNDEF")
2938     LROffset = 0;
2939   else
2940     report_fatal_error("Unsupported interrupt attribute. If present, value "
2941                        "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
2942 
2943   RetOps.insert(RetOps.begin() + 1,
2944                 DAG.getConstant(LROffset, DL, MVT::i32, false));
2945 
2946   return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
2947 }
2948 
2949 SDValue
2950 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2951                                bool isVarArg,
2952                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2953                                const SmallVectorImpl<SDValue> &OutVals,
2954                                const SDLoc &dl, SelectionDAG &DAG) const {
2955   // CCValAssign - represent the assignment of the return value to a location.
2956   SmallVector<CCValAssign, 16> RVLocs;
2957 
2958   // CCState - Info about the registers and stack slots.
2959   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2960                  *DAG.getContext());
2961 
2962   // Analyze outgoing return values.
2963   CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2964 
2965   SDValue Flag;
2966   SmallVector<SDValue, 4> RetOps;
2967   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2968   bool isLittleEndian = Subtarget->isLittle();
2969 
2970   MachineFunction &MF = DAG.getMachineFunction();
2971   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2972   AFI->setReturnRegsCount(RVLocs.size());
2973 
2974  // Report error if cmse entry function returns structure through first ptr arg.
2975   if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
2976     // Note: using an empty SDLoc(), as the first line of the function is a
2977     // better place to report than the last line.
2978     DiagnosticInfoUnsupported Diag(
2979         DAG.getMachineFunction().getFunction(),
2980         "secure entry function would return value through pointer",
2981         SDLoc().getDebugLoc());
2982     DAG.getContext()->diagnose(Diag);
2983   }
2984 
2985   // Copy the result values into the output registers.
2986   for (unsigned i = 0, realRVLocIdx = 0;
2987        i != RVLocs.size();
2988        ++i, ++realRVLocIdx) {
2989     CCValAssign &VA = RVLocs[i];
2990     assert(VA.isRegLoc() && "Can only return in registers!");
2991 
2992     SDValue Arg = OutVals[realRVLocIdx];
2993     bool ReturnF16 = false;
2994 
2995     if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
2996       // Half-precision return values can be returned like this:
2997       //
2998       // t11 f16 = fadd ...
2999       // t12: i16 = bitcast t11
3000       //   t13: i32 = zero_extend t12
3001       // t14: f32 = bitcast t13  <~~~~~~~ Arg
3002       //
3003       // to avoid code generation for bitcasts, we simply set Arg to the node
3004       // that produces the f16 value, t11 in this case.
3005       //
3006       if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
3007         SDValue ZE = Arg.getOperand(0);
3008         if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
3009           SDValue BC = ZE.getOperand(0);
3010           if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
3011             Arg = BC.getOperand(0);
3012             ReturnF16 = true;
3013           }
3014         }
3015       }
3016     }
3017 
3018     switch (VA.getLocInfo()) {
3019     default: llvm_unreachable("Unknown loc info!");
3020     case CCValAssign::Full: break;
3021     case CCValAssign::BCvt:
3022       if (!ReturnF16)
3023         Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3024       break;
3025     }
3026 
3027     // Mask f16 arguments if this is a CMSE nonsecure entry.
3028     auto RetVT = Outs[realRVLocIdx].ArgVT;
3029     if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
3030       if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
3031         Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
3032       } else {
3033         auto LocBits = VA.getLocVT().getSizeInBits();
3034         auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
3035         SDValue Mask =
3036             DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
3037         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
3038         Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
3039         Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3040       }
3041     }
3042 
3043     if (VA.needsCustom() &&
3044         (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3045       if (VA.getLocVT() == MVT::v2f64) {
3046         // Extract the first half and return it in two registers.
3047         SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3048                                    DAG.getConstant(0, dl, MVT::i32));
3049         SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3050                                        DAG.getVTList(MVT::i32, MVT::i32), Half);
3051 
3052         Chain =
3053             DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3054                              HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag);
3055         Flag = Chain.getValue(1);
3056         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3057         VA = RVLocs[++i]; // skip ahead to next loc
3058         Chain =
3059             DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3060                              HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag);
3061         Flag = Chain.getValue(1);
3062         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3063         VA = RVLocs[++i]; // skip ahead to next loc
3064 
3065         // Extract the 2nd half and fall through to handle it as an f64 value.
3066         Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3067                           DAG.getConstant(1, dl, MVT::i32));
3068       }
3069       // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
3070       // available.
3071       SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3072                                   DAG.getVTList(MVT::i32, MVT::i32), Arg);
3073       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3074                                fmrrd.getValue(isLittleEndian ? 0 : 1), Flag);
3075       Flag = Chain.getValue(1);
3076       RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3077       VA = RVLocs[++i]; // skip ahead to next loc
3078       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3079                                fmrrd.getValue(isLittleEndian ? 1 : 0), Flag);
3080     } else
3081       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
3082 
3083     // Guarantee that all emitted copies are
3084     // stuck together, avoiding something bad.
3085     Flag = Chain.getValue(1);
3086     RetOps.push_back(DAG.getRegister(
3087         VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3088   }
3089   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3090   const MCPhysReg *I =
3091       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3092   if (I) {
3093     for (; *I; ++I) {
3094       if (ARM::GPRRegClass.contains(*I))
3095         RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3096       else if (ARM::DPRRegClass.contains(*I))
3097         RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3098       else
3099         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3100     }
3101   }
3102 
3103   // Update chain and glue.
3104   RetOps[0] = Chain;
3105   if (Flag.getNode())
3106     RetOps.push_back(Flag);
3107 
3108   // CPUs which aren't M-class use a special sequence to return from
3109   // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3110   // though we use "subs pc, lr, #N").
3111   //
3112   // M-class CPUs actually use a normal return sequence with a special
3113   // (hardware-provided) value in LR, so the normal code path works.
3114   if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3115       !Subtarget->isMClass()) {
3116     if (Subtarget->isThumb1Only())
3117       report_fatal_error("interrupt attribute is not supported in Thumb1");
3118     return LowerInterruptReturn(RetOps, dl, DAG);
3119   }
3120 
3121   ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG :
3122                                                             ARMISD::RET_FLAG;
3123   return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3124 }
3125 
3126 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3127   if (N->getNumValues() != 1)
3128     return false;
3129   if (!N->hasNUsesOfValue(1, 0))
3130     return false;
3131 
3132   SDValue TCChain = Chain;
3133   SDNode *Copy = *N->use_begin();
3134   if (Copy->getOpcode() == ISD::CopyToReg) {
3135     // If the copy has a glue operand, we conservatively assume it isn't safe to
3136     // perform a tail call.
3137     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3138       return false;
3139     TCChain = Copy->getOperand(0);
3140   } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3141     SDNode *VMov = Copy;
3142     // f64 returned in a pair of GPRs.
3143     SmallPtrSet<SDNode*, 2> Copies;
3144     for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
3145          UI != UE; ++UI) {
3146       if (UI->getOpcode() != ISD::CopyToReg)
3147         return false;
3148       Copies.insert(*UI);
3149     }
3150     if (Copies.size() > 2)
3151       return false;
3152 
3153     for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
3154          UI != UE; ++UI) {
3155       SDValue UseChain = UI->getOperand(0);
3156       if (Copies.count(UseChain.getNode()))
3157         // Second CopyToReg
3158         Copy = *UI;
3159       else {
3160         // We are at the top of this chain.
3161         // If the copy has a glue operand, we conservatively assume it
3162         // isn't safe to perform a tail call.
3163         if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
3164           return false;
3165         // First CopyToReg
3166         TCChain = UseChain;
3167       }
3168     }
3169   } else if (Copy->getOpcode() == ISD::BITCAST) {
3170     // f32 returned in a single GPR.
3171     if (!Copy->hasOneUse())
3172       return false;
3173     Copy = *Copy->use_begin();
3174     if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3175       return false;
3176     // If the copy has a glue operand, we conservatively assume it isn't safe to
3177     // perform a tail call.
3178     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3179       return false;
3180     TCChain = Copy->getOperand(0);
3181   } else {
3182     return false;
3183   }
3184 
3185   bool HasRet = false;
3186   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
3187        UI != UE; ++UI) {
3188     if (UI->getOpcode() != ARMISD::RET_FLAG &&
3189         UI->getOpcode() != ARMISD::INTRET_FLAG)
3190       return false;
3191     HasRet = true;
3192   }
3193 
3194   if (!HasRet)
3195     return false;
3196 
3197   Chain = TCChain;
3198   return true;
3199 }
3200 
3201 bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3202   if (!Subtarget->supportsTailCall())
3203     return false;
3204 
3205   if (!CI->isTailCall())
3206     return false;
3207 
3208   return true;
3209 }
3210 
3211 // Trying to write a 64 bit value so need to split into two 32 bit values first,
3212 // and pass the lower and high parts through.
3213 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
3214   SDLoc DL(Op);
3215   SDValue WriteValue = Op->getOperand(2);
3216 
3217   // This function is only supposed to be called for i64 type argument.
3218   assert(WriteValue.getValueType() == MVT::i64
3219           && "LowerWRITE_REGISTER called for non-i64 type argument.");
3220 
3221   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
3222                            DAG.getConstant(0, DL, MVT::i32));
3223   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
3224                            DAG.getConstant(1, DL, MVT::i32));
3225   SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3226   return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3227 }
3228 
3229 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3230 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3231 // one of the above mentioned nodes. It has to be wrapped because otherwise
3232 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3233 // be used to form addressing mode. These wrapped nodes will be selected
3234 // into MOVi.
3235 SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3236                                              SelectionDAG &DAG) const {
3237   EVT PtrVT = Op.getValueType();
3238   // FIXME there is no actual debug info here
3239   SDLoc dl(Op);
3240   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3241   SDValue Res;
3242 
3243   // When generating execute-only code Constant Pools must be promoted to the
3244   // global data section. It's a bit ugly that we can't share them across basic
3245   // blocks, but this way we guarantee that execute-only behaves correct with
3246   // position-independent addressing modes.
3247   if (Subtarget->genExecuteOnly()) {
3248     auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3249     auto T = const_cast<Type*>(CP->getType());
3250     auto C = const_cast<Constant*>(CP->getConstVal());
3251     auto M = const_cast<Module*>(DAG.getMachineFunction().
3252                                  getFunction().getParent());
3253     auto GV = new GlobalVariable(
3254                     *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3255                     Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
3256                     Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
3257                     Twine(AFI->createPICLabelUId())
3258                   );
3259     SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
3260                                             dl, PtrVT);
3261     return LowerGlobalAddress(GA, DAG);
3262   }
3263 
3264   if (CP->isMachineConstantPoolEntry())
3265     Res =
3266         DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign());
3267   else
3268     Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign());
3269   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3270 }
3271 
3272 unsigned ARMTargetLowering::getJumpTableEncoding() const {
3273   return MachineJumpTableInfo::EK_Inline;
3274 }
3275 
3276 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3277                                              SelectionDAG &DAG) const {
3278   MachineFunction &MF = DAG.getMachineFunction();
3279   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3280   unsigned ARMPCLabelIndex = 0;
3281   SDLoc DL(Op);
3282   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3283   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3284   SDValue CPAddr;
3285   bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3286   if (!IsPositionIndependent) {
3287     CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3288   } else {
3289     unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3290     ARMPCLabelIndex = AFI->createPICLabelUId();
3291     ARMConstantPoolValue *CPV =
3292       ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3293                                       ARMCP::CPBlockAddress, PCAdj);
3294     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3295   }
3296   CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3297   SDValue Result = DAG.getLoad(
3298       PtrVT, DL, DAG.getEntryNode(), CPAddr,
3299       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3300   if (!IsPositionIndependent)
3301     return Result;
3302   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3303   return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3304 }
3305 
3306 /// Convert a TLS address reference into the correct sequence of loads
3307 /// and calls to compute the variable's address for Darwin, and return an
3308 /// SDValue containing the final node.
3309 
3310 /// Darwin only has one TLS scheme which must be capable of dealing with the
3311 /// fully general situation, in the worst case. This means:
3312 ///     + "extern __thread" declaration.
3313 ///     + Defined in a possibly unknown dynamic library.
3314 ///
3315 /// The general system is that each __thread variable has a [3 x i32] descriptor
3316 /// which contains information used by the runtime to calculate the address. The
3317 /// only part of this the compiler needs to know about is the first word, which
3318 /// contains a function pointer that must be called with the address of the
3319 /// entire descriptor in "r0".
3320 ///
3321 /// Since this descriptor may be in a different unit, in general access must
3322 /// proceed along the usual ARM rules. A common sequence to produce is:
3323 ///
3324 ///     movw rT1, :lower16:_var$non_lazy_ptr
3325 ///     movt rT1, :upper16:_var$non_lazy_ptr
3326 ///     ldr r0, [rT1]
3327 ///     ldr rT2, [r0]
3328 ///     blx rT2
3329 ///     [...address now in r0...]
3330 SDValue
3331 ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3332                                                SelectionDAG &DAG) const {
3333   assert(Subtarget->isTargetDarwin() &&
3334          "This function expects a Darwin target");
3335   SDLoc DL(Op);
3336 
3337   // First step is to get the address of the actua global symbol. This is where
3338   // the TLS descriptor lives.
3339   SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3340 
3341   // The first entry in the descriptor is a function pointer that we must call
3342   // to obtain the address of the variable.
3343   SDValue Chain = DAG.getEntryNode();
3344   SDValue FuncTLVGet = DAG.getLoad(
3345       MVT::i32, DL, Chain, DescAddr,
3346       MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4),
3347       MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
3348           MachineMemOperand::MOInvariant);
3349   Chain = FuncTLVGet.getValue(1);
3350 
3351   MachineFunction &F = DAG.getMachineFunction();
3352   MachineFrameInfo &MFI = F.getFrameInfo();
3353   MFI.setAdjustsStack(true);
3354 
3355   // TLS calls preserve all registers except those that absolutely must be
3356   // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3357   // silly).
3358   auto TRI =
3359       getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
3360   auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3361   const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3362 
3363   // Finally, we can make the call. This is just a degenerate version of a
3364   // normal AArch64 call node: r0 takes the address of the descriptor, and
3365   // returns the address of the variable in this thread.
3366   Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3367   Chain =
3368       DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3369                   Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3370                   DAG.getRegisterMask(Mask), Chain.getValue(1));
3371   return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3372 }
3373 
3374 SDValue
3375 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3376                                                 SelectionDAG &DAG) const {
3377   assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3378 
3379   SDValue Chain = DAG.getEntryNode();
3380   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3381   SDLoc DL(Op);
3382 
3383   // Load the current TEB (thread environment block)
3384   SDValue Ops[] = {Chain,
3385                    DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3386                    DAG.getTargetConstant(15, DL, MVT::i32),
3387                    DAG.getTargetConstant(0, DL, MVT::i32),
3388                    DAG.getTargetConstant(13, DL, MVT::i32),
3389                    DAG.getTargetConstant(0, DL, MVT::i32),
3390                    DAG.getTargetConstant(2, DL, MVT::i32)};
3391   SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3392                                    DAG.getVTList(MVT::i32, MVT::Other), Ops);
3393 
3394   SDValue TEB = CurrentTEB.getValue(0);
3395   Chain = CurrentTEB.getValue(1);
3396 
3397   // Load the ThreadLocalStoragePointer from the TEB
3398   // A pointer to the TLS array is located at offset 0x2c from the TEB.
3399   SDValue TLSArray =
3400       DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3401   TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3402 
3403   // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
3404   // offset into the TLSArray.
3405 
3406   // Load the TLS index from the C runtime
3407   SDValue TLSIndex =
3408       DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3409   TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3410   TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3411 
3412   SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3413                               DAG.getConstant(2, DL, MVT::i32));
3414   SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3415                             DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3416                             MachinePointerInfo());
3417 
3418   // Get the offset of the start of the .tls section (section base)
3419   const auto *GA = cast<GlobalAddressSDNode>(Op);
3420   auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3421   SDValue Offset = DAG.getLoad(
3422       PtrVT, DL, Chain,
3423       DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3424                   DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3425       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3426 
3427   return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3428 }
3429 
3430 // Lower ISD::GlobalTLSAddress using the "general dynamic" model
3431 SDValue
3432 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3433                                                  SelectionDAG &DAG) const {
3434   SDLoc dl(GA);
3435   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3436   unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3437   MachineFunction &MF = DAG.getMachineFunction();
3438   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3439   unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3440   ARMConstantPoolValue *CPV =
3441     ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3442                                     ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3443   SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3444   Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3445   Argument = DAG.getLoad(
3446       PtrVT, dl, DAG.getEntryNode(), Argument,
3447       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3448   SDValue Chain = Argument.getValue(1);
3449 
3450   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3451   Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3452 
3453   // call __tls_get_addr.
3454   ArgListTy Args;
3455   ArgListEntry Entry;
3456   Entry.Node = Argument;
3457   Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
3458   Args.push_back(Entry);
3459 
3460   // FIXME: is there useful debug info available here?
3461   TargetLowering::CallLoweringInfo CLI(DAG);
3462   CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3463       CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
3464       DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3465 
3466   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3467   return CallResult.first;
3468 }
3469 
3470 // Lower ISD::GlobalTLSAddress using the "initial exec" or
3471 // "local exec" model.
3472 SDValue
3473 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3474                                         SelectionDAG &DAG,
3475                                         TLSModel::Model model) const {
3476   const GlobalValue *GV = GA->getGlobal();
3477   SDLoc dl(GA);
3478   SDValue Offset;
3479   SDValue Chain = DAG.getEntryNode();
3480   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3481   // Get the Thread Pointer
3482   SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3483 
3484   if (model == TLSModel::InitialExec) {
3485     MachineFunction &MF = DAG.getMachineFunction();
3486     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3487     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3488     // Initial exec model.
3489     unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3490     ARMConstantPoolValue *CPV =
3491       ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3492                                       ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
3493                                       true);
3494     Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3495     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3496     Offset = DAG.getLoad(
3497         PtrVT, dl, Chain, Offset,
3498         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3499     Chain = Offset.getValue(1);
3500 
3501     SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3502     Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3503 
3504     Offset = DAG.getLoad(
3505         PtrVT, dl, Chain, Offset,
3506         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3507   } else {
3508     // local exec model
3509     assert(model == TLSModel::LocalExec);
3510     ARMConstantPoolValue *CPV =
3511       ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
3512     Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3513     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3514     Offset = DAG.getLoad(
3515         PtrVT, dl, Chain, Offset,
3516         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3517   }
3518 
3519   // The address of the thread local variable is the add of the thread
3520   // pointer with the offset of the variable.
3521   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3522 }
3523 
3524 SDValue
3525 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3526   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3527   if (DAG.getTarget().useEmulatedTLS())
3528     return LowerToTLSEmulatedModel(GA, DAG);
3529 
3530   if (Subtarget->isTargetDarwin())
3531     return LowerGlobalTLSAddressDarwin(Op, DAG);
3532 
3533   if (Subtarget->isTargetWindows())
3534     return LowerGlobalTLSAddressWindows(Op, DAG);
3535 
3536   // TODO: implement the "local dynamic" model
3537   assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3538   TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
3539 
3540   switch (model) {
3541     case TLSModel::GeneralDynamic:
3542     case TLSModel::LocalDynamic:
3543       return LowerToTLSGeneralDynamicModel(GA, DAG);
3544     case TLSModel::InitialExec:
3545     case TLSModel::LocalExec:
3546       return LowerToTLSExecModels(GA, DAG, model);
3547   }
3548   llvm_unreachable("bogus TLS model");
3549 }
3550 
3551 /// Return true if all users of V are within function F, looking through
3552 /// ConstantExprs.
3553 static bool allUsersAreInFunction(const Value *V, const Function *F) {
3554   SmallVector<const User*,4> Worklist;
3555   for (auto *U : V->users())
3556     Worklist.push_back(U);
3557   while (!Worklist.empty()) {
3558     auto *U = Worklist.pop_back_val();
3559     if (isa<ConstantExpr>(U)) {
3560       append_range(Worklist, U->users());
3561       continue;
3562     }
3563 
3564     auto *I = dyn_cast<Instruction>(U);
3565     if (!I || I->getParent()->getParent() != F)
3566       return false;
3567   }
3568   return true;
3569 }
3570 
3571 static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
3572                                      const GlobalValue *GV, SelectionDAG &DAG,
3573                                      EVT PtrVT, const SDLoc &dl) {
3574   // If we're creating a pool entry for a constant global with unnamed address,
3575   // and the global is small enough, we can emit it inline into the constant pool
3576   // to save ourselves an indirection.
3577   //
3578   // This is a win if the constant is only used in one function (so it doesn't
3579   // need to be duplicated) or duplicating the constant wouldn't increase code
3580   // size (implying the constant is no larger than 4 bytes).
3581   const Function &F = DAG.getMachineFunction().getFunction();
3582 
3583   // We rely on this decision to inline being idemopotent and unrelated to the
3584   // use-site. We know that if we inline a variable at one use site, we'll
3585   // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3586   // doesn't know about this optimization, so bail out if it's enabled else
3587   // we could decide to inline here (and thus never emit the GV) but require
3588   // the GV from fast-isel generated code.
3589   if (!EnableConstpoolPromotion ||
3590       DAG.getMachineFunction().getTarget().Options.EnableFastISel)
3591       return SDValue();
3592 
3593   auto *GVar = dyn_cast<GlobalVariable>(GV);
3594   if (!GVar || !GVar->hasInitializer() ||
3595       !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3596       !GVar->hasLocalLinkage())
3597     return SDValue();
3598 
3599   // If we inline a value that contains relocations, we move the relocations
3600   // from .data to .text. This is not allowed in position-independent code.
3601   auto *Init = GVar->getInitializer();
3602   if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3603       Init->needsRelocation())
3604     return SDValue();
3605 
3606   // The constant islands pass can only really deal with alignment requests
3607   // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3608   // any type wanting greater alignment requirements than 4 bytes. We also
3609   // can only promote constants that are multiples of 4 bytes in size or
3610   // are paddable to a multiple of 4. Currently we only try and pad constants
3611   // that are strings for simplicity.
3612   auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3613   unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3614   Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
3615   unsigned RequiredPadding = 4 - (Size % 4);
3616   bool PaddingPossible =
3617     RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3618   if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3619       Size == 0)
3620     return SDValue();
3621 
3622   unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3623   MachineFunction &MF = DAG.getMachineFunction();
3624   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3625 
3626   // We can't bloat the constant pool too much, else the ConstantIslands pass
3627   // may fail to converge. If we haven't promoted this global yet (it may have
3628   // multiple uses), and promoting it would increase the constant pool size (Sz
3629   // > 4), ensure we have space to do so up to MaxTotal.
3630   if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3631     if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3632         ConstpoolPromotionMaxTotal)
3633       return SDValue();
3634 
3635   // This is only valid if all users are in a single function; we can't clone
3636   // the constant in general. The LLVM IR unnamed_addr allows merging
3637   // constants, but not cloning them.
3638   //
3639   // We could potentially allow cloning if we could prove all uses of the
3640   // constant in the current function don't care about the address, like
3641   // printf format strings. But that isn't implemented for now.
3642   if (!allUsersAreInFunction(GVar, &F))
3643     return SDValue();
3644 
3645   // We're going to inline this global. Pad it out if needed.
3646   if (RequiredPadding != 4) {
3647     StringRef S = CDAInit->getAsString();
3648 
3649     SmallVector<uint8_t,16> V(S.size());
3650     std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3651     while (RequiredPadding--)
3652       V.push_back(0);
3653     Init = ConstantDataArray::get(*DAG.getContext(), V);
3654   }
3655 
3656   auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3657   SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
3658   if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3659     AFI->markGlobalAsPromotedToConstantPool(GVar);
3660     AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
3661                                       PaddedSize - 4);
3662   }
3663   ++NumConstpoolPromoted;
3664   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3665 }
3666 
3667 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
3668   if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3669     if (!(GV = GA->getBaseObject()))
3670       return false;
3671   if (const auto *V = dyn_cast<GlobalVariable>(GV))
3672     return V->isConstant();
3673   return isa<Function>(GV);
3674 }
3675 
3676 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3677                                               SelectionDAG &DAG) const {
3678   switch (Subtarget->getTargetTriple().getObjectFormat()) {
3679   default: llvm_unreachable("unknown object format");
3680   case Triple::COFF:
3681     return LowerGlobalAddressWindows(Op, DAG);
3682   case Triple::ELF:
3683     return LowerGlobalAddressELF(Op, DAG);
3684   case Triple::MachO:
3685     return LowerGlobalAddressDarwin(Op, DAG);
3686   }
3687 }
3688 
3689 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3690                                                  SelectionDAG &DAG) const {
3691   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3692   SDLoc dl(Op);
3693   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3694   const TargetMachine &TM = getTargetMachine();
3695   bool IsRO = isReadOnly(GV);
3696 
3697   // promoteToConstantPool only if not generating XO text section
3698   if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly())
3699     if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3700       return V;
3701 
3702   if (isPositionIndependent()) {
3703     bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
3704     SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
3705                                            UseGOT_PREL ? ARMII::MO_GOT : 0);
3706     SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3707     if (UseGOT_PREL)
3708       Result =
3709           DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3710                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3711     return Result;
3712   } else if (Subtarget->isROPI() && IsRO) {
3713     // PC-relative.
3714     SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3715     SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3716     return Result;
3717   } else if (Subtarget->isRWPI() && !IsRO) {
3718     // SB-relative.
3719     SDValue RelAddr;
3720     if (Subtarget->useMovt()) {
3721       ++NumMovwMovt;
3722       SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3723       RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3724     } else { // use literal pool for address constant
3725       ARMConstantPoolValue *CPV =
3726         ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
3727       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3728       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3729       RelAddr = DAG.getLoad(
3730           PtrVT, dl, DAG.getEntryNode(), CPAddr,
3731           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3732     }
3733     SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3734     SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3735     return Result;
3736   }
3737 
3738   // If we have T2 ops, we can materialize the address directly via movt/movw
3739   // pair. This is always cheaper.
3740   if (Subtarget->useMovt()) {
3741     ++NumMovwMovt;
3742     // FIXME: Once remat is capable of dealing with instructions with register
3743     // operands, expand this into two nodes.
3744     return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3745                        DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3746   } else {
3747     SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
3748     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3749     return DAG.getLoad(
3750         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3751         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3752   }
3753 }
3754 
3755 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3756                                                     SelectionDAG &DAG) const {
3757   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3758          "ROPI/RWPI not currently supported for Darwin");
3759   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3760   SDLoc dl(Op);
3761   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3762 
3763   if (Subtarget->useMovt())
3764     ++NumMovwMovt;
3765 
3766   // FIXME: Once remat is capable of dealing with instructions with register
3767   // operands, expand this into multiple nodes
3768   unsigned Wrapper =
3769       isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3770 
3771   SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3772   SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3773 
3774   if (Subtarget->isGVIndirectSymbol(GV))
3775     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3776                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3777   return Result;
3778 }
3779 
3780 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3781                                                      SelectionDAG &DAG) const {
3782   assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3783   assert(Subtarget->useMovt() &&
3784          "Windows on ARM expects to use movw/movt");
3785   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3786          "ROPI/RWPI not currently supported for Windows");
3787 
3788   const TargetMachine &TM = getTargetMachine();
3789   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3790   ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3791   if (GV->hasDLLImportStorageClass())
3792     TargetFlags = ARMII::MO_DLLIMPORT;
3793   else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
3794     TargetFlags = ARMII::MO_COFFSTUB;
3795   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3796   SDValue Result;
3797   SDLoc DL(Op);
3798 
3799   ++NumMovwMovt;
3800 
3801   // FIXME: Once remat is capable of dealing with instructions with register
3802   // operands, expand this into two nodes.
3803   Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3804                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3805                                                   TargetFlags));
3806   if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3807     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3808                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3809   return Result;
3810 }
3811 
3812 SDValue
3813 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3814   SDLoc dl(Op);
3815   SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3816   return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3817                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3818                      Op.getOperand(1), Val);
3819 }
3820 
3821 SDValue
3822 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3823   SDLoc dl(Op);
3824   return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3825                      Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3826 }
3827 
3828 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3829                                                       SelectionDAG &DAG) const {
3830   SDLoc dl(Op);
3831   return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3832                      Op.getOperand(0));
3833 }
3834 
3835 SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
3836     SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
3837   unsigned IntNo =
3838       cast<ConstantSDNode>(
3839           Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other))
3840           ->getZExtValue();
3841   switch (IntNo) {
3842     default:
3843       return SDValue();  // Don't custom lower most intrinsics.
3844     case Intrinsic::arm_gnu_eabi_mcount: {
3845       MachineFunction &MF = DAG.getMachineFunction();
3846       EVT PtrVT = getPointerTy(DAG.getDataLayout());
3847       SDLoc dl(Op);
3848       SDValue Chain = Op.getOperand(0);
3849       // call "\01__gnu_mcount_nc"
3850       const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
3851       const uint32_t *Mask =
3852           ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
3853       assert(Mask && "Missing call preserved mask for calling convention");
3854       // Mark LR an implicit live-in.
3855       unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
3856       SDValue ReturnAddress =
3857           DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
3858       constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
3859       SDValue Callee =
3860           DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
3861       SDValue RegisterMask = DAG.getRegisterMask(Mask);
3862       if (Subtarget->isThumb())
3863         return SDValue(
3864             DAG.getMachineNode(
3865                 ARM::tBL_PUSHLR, dl, ResultTys,
3866                 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
3867                  DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
3868             0);
3869       return SDValue(
3870           DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
3871                              {ReturnAddress, Callee, RegisterMask, Chain}),
3872           0);
3873     }
3874   }
3875 }
3876 
3877 SDValue
3878 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
3879                                           const ARMSubtarget *Subtarget) const {
3880   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3881   SDLoc dl(Op);
3882   switch (IntNo) {
3883   default: return SDValue();    // Don't custom lower most intrinsics.
3884   case Intrinsic::thread_pointer: {
3885     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3886     return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3887   }
3888   case Intrinsic::arm_cls: {
3889     const SDValue &Operand = Op.getOperand(1);
3890     const EVT VTy = Op.getValueType();
3891     SDValue SRA =
3892         DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
3893     SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
3894     SDValue SHL =
3895         DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
3896     SDValue OR =
3897         DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
3898     SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
3899     return Result;
3900   }
3901   case Intrinsic::arm_cls64: {
3902     // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
3903     //          else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
3904     const SDValue &Operand = Op.getOperand(1);
3905     const EVT VTy = Op.getValueType();
3906 
3907     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
3908                              DAG.getConstant(1, dl, VTy));
3909     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
3910                              DAG.getConstant(0, dl, VTy));
3911     SDValue Constant0 = DAG.getConstant(0, dl, VTy);
3912     SDValue Constant1 = DAG.getConstant(1, dl, VTy);
3913     SDValue Constant31 = DAG.getConstant(31, dl, VTy);
3914     SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
3915     SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
3916     SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
3917     SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
3918     SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
3919     SDValue CheckLo =
3920         DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
3921     SDValue HiIsZero =
3922         DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
3923     SDValue AdjustedLo =
3924         DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
3925     SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
3926     SDValue Result =
3927         DAG.getSelect(dl, VTy, CheckLo,
3928                       DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
3929     return Result;
3930   }
3931   case Intrinsic::eh_sjlj_lsda: {
3932     MachineFunction &MF = DAG.getMachineFunction();
3933     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3934     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3935     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3936     SDValue CPAddr;
3937     bool IsPositionIndependent = isPositionIndependent();
3938     unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
3939     ARMConstantPoolValue *CPV =
3940       ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
3941                                       ARMCP::CPLSDA, PCAdj);
3942     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3943     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3944     SDValue Result = DAG.getLoad(
3945         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3946         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3947 
3948     if (IsPositionIndependent) {
3949       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3950       Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
3951     }
3952     return Result;
3953   }
3954   case Intrinsic::arm_neon_vabs:
3955     return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
3956                         Op.getOperand(1));
3957   case Intrinsic::arm_neon_vmulls:
3958   case Intrinsic::arm_neon_vmullu: {
3959     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
3960       ? ARMISD::VMULLs : ARMISD::VMULLu;
3961     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3962                        Op.getOperand(1), Op.getOperand(2));
3963   }
3964   case Intrinsic::arm_neon_vminnm:
3965   case Intrinsic::arm_neon_vmaxnm: {
3966     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
3967       ? ISD::FMINNUM : ISD::FMAXNUM;
3968     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3969                        Op.getOperand(1), Op.getOperand(2));
3970   }
3971   case Intrinsic::arm_neon_vminu:
3972   case Intrinsic::arm_neon_vmaxu: {
3973     if (Op.getValueType().isFloatingPoint())
3974       return SDValue();
3975     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
3976       ? ISD::UMIN : ISD::UMAX;
3977     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3978                          Op.getOperand(1), Op.getOperand(2));
3979   }
3980   case Intrinsic::arm_neon_vmins:
3981   case Intrinsic::arm_neon_vmaxs: {
3982     // v{min,max}s is overloaded between signed integers and floats.
3983     if (!Op.getValueType().isFloatingPoint()) {
3984       unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3985         ? ISD::SMIN : ISD::SMAX;
3986       return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3987                          Op.getOperand(1), Op.getOperand(2));
3988     }
3989     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3990       ? ISD::FMINIMUM : ISD::FMAXIMUM;
3991     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3992                        Op.getOperand(1), Op.getOperand(2));
3993   }
3994   case Intrinsic::arm_neon_vtbl1:
3995     return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
3996                        Op.getOperand(1), Op.getOperand(2));
3997   case Intrinsic::arm_neon_vtbl2:
3998     return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
3999                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4000   case Intrinsic::arm_mve_pred_i2v:
4001   case Intrinsic::arm_mve_pred_v2i:
4002     return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
4003                        Op.getOperand(1));
4004   case Intrinsic::arm_mve_vreinterpretq:
4005     return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
4006                        Op.getOperand(1));
4007   case Intrinsic::arm_mve_lsll:
4008     return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
4009                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4010   case Intrinsic::arm_mve_asrl:
4011     return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
4012                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4013   }
4014 }
4015 
4016 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
4017                                  const ARMSubtarget *Subtarget) {
4018   SDLoc dl(Op);
4019   ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2));
4020   auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue());
4021   if (SSID == SyncScope::SingleThread)
4022     return Op;
4023 
4024   if (!Subtarget->hasDataBarrier()) {
4025     // Some ARMv6 cpus can support data barriers with an mcr instruction.
4026     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
4027     // here.
4028     assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
4029            "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
4030     return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
4031                        DAG.getConstant(0, dl, MVT::i32));
4032   }
4033 
4034   ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
4035   AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
4036   ARM_MB::MemBOpt Domain = ARM_MB::ISH;
4037   if (Subtarget->isMClass()) {
4038     // Only a full system barrier exists in the M-class architectures.
4039     Domain = ARM_MB::SY;
4040   } else if (Subtarget->preferISHSTBarriers() &&
4041              Ord == AtomicOrdering::Release) {
4042     // Swift happens to implement ISHST barriers in a way that's compatible with
4043     // Release semantics but weaker than ISH so we'd be fools not to use
4044     // it. Beware: other processors probably don't!
4045     Domain = ARM_MB::ISHST;
4046   }
4047 
4048   return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4049                      DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4050                      DAG.getConstant(Domain, dl, MVT::i32));
4051 }
4052 
4053 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
4054                              const ARMSubtarget *Subtarget) {
4055   // ARM pre v5TE and Thumb1 does not have preload instructions.
4056   if (!(Subtarget->isThumb2() ||
4057         (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4058     // Just preserve the chain.
4059     return Op.getOperand(0);
4060 
4061   SDLoc dl(Op);
4062   unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
4063   if (!isRead &&
4064       (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4065     // ARMv7 with MP extension has PLDW.
4066     return Op.getOperand(0);
4067 
4068   unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
4069   if (Subtarget->isThumb()) {
4070     // Invert the bits.
4071     isRead = ~isRead & 1;
4072     isData = ~isData & 1;
4073   }
4074 
4075   return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4076                      Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4077                      DAG.getConstant(isData, dl, MVT::i32));
4078 }
4079 
4080 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
4081   MachineFunction &MF = DAG.getMachineFunction();
4082   ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4083 
4084   // vastart just stores the address of the VarArgsFrameIndex slot into the
4085   // memory location argument.
4086   SDLoc dl(Op);
4087   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
4088   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
4089   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4090   return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4091                       MachinePointerInfo(SV));
4092 }
4093 
4094 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4095                                                 CCValAssign &NextVA,
4096                                                 SDValue &Root,
4097                                                 SelectionDAG &DAG,
4098                                                 const SDLoc &dl) const {
4099   MachineFunction &MF = DAG.getMachineFunction();
4100   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4101 
4102   const TargetRegisterClass *RC;
4103   if (AFI->isThumb1OnlyFunction())
4104     RC = &ARM::tGPRRegClass;
4105   else
4106     RC = &ARM::GPRRegClass;
4107 
4108   // Transform the arguments stored in physical registers into virtual ones.
4109   unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
4110   SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4111 
4112   SDValue ArgValue2;
4113   if (NextVA.isMemLoc()) {
4114     MachineFrameInfo &MFI = MF.getFrameInfo();
4115     int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4116 
4117     // Create load node to retrieve arguments from the stack.
4118     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4119     ArgValue2 = DAG.getLoad(
4120         MVT::i32, dl, Root, FIN,
4121         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4122   } else {
4123     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4124     ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4125   }
4126   if (!Subtarget->isLittle())
4127     std::swap (ArgValue, ArgValue2);
4128   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4129 }
4130 
4131 // The remaining GPRs hold either the beginning of variable-argument
4132 // data, or the beginning of an aggregate passed by value (usually
4133 // byval).  Either way, we allocate stack slots adjacent to the data
4134 // provided by our caller, and store the unallocated registers there.
4135 // If this is a variadic function, the va_list pointer will begin with
4136 // these values; otherwise, this reassembles a (byval) structure that
4137 // was split between registers and memory.
4138 // Return: The frame index registers were stored into.
4139 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4140                                       const SDLoc &dl, SDValue &Chain,
4141                                       const Value *OrigArg,
4142                                       unsigned InRegsParamRecordIdx,
4143                                       int ArgOffset, unsigned ArgSize) const {
4144   // Currently, two use-cases possible:
4145   // Case #1. Non-var-args function, and we meet first byval parameter.
4146   //          Setup first unallocated register as first byval register;
4147   //          eat all remained registers
4148   //          (these two actions are performed by HandleByVal method).
4149   //          Then, here, we initialize stack frame with
4150   //          "store-reg" instructions.
4151   // Case #2. Var-args function, that doesn't contain byval parameters.
4152   //          The same: eat all remained unallocated registers,
4153   //          initialize stack frame.
4154 
4155   MachineFunction &MF = DAG.getMachineFunction();
4156   MachineFrameInfo &MFI = MF.getFrameInfo();
4157   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4158   unsigned RBegin, REnd;
4159   if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4160     CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4161   } else {
4162     unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4163     RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4164     REnd = ARM::R4;
4165   }
4166 
4167   if (REnd != RBegin)
4168     ArgOffset = -4 * (ARM::R4 - RBegin);
4169 
4170   auto PtrVT = getPointerTy(DAG.getDataLayout());
4171   int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4172   SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4173 
4174   SmallVector<SDValue, 4> MemOps;
4175   const TargetRegisterClass *RC =
4176       AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4177 
4178   for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4179     unsigned VReg = MF.addLiveIn(Reg, RC);
4180     SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4181     SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4182                                  MachinePointerInfo(OrigArg, 4 * i));
4183     MemOps.push_back(Store);
4184     FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4185   }
4186 
4187   if (!MemOps.empty())
4188     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4189   return FrameIndex;
4190 }
4191 
4192 // Setup stack frame, the va_list pointer will start from.
4193 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4194                                              const SDLoc &dl, SDValue &Chain,
4195                                              unsigned ArgOffset,
4196                                              unsigned TotalArgRegsSaveSize,
4197                                              bool ForceMutable) const {
4198   MachineFunction &MF = DAG.getMachineFunction();
4199   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4200 
4201   // Try to store any remaining integer argument regs
4202   // to their spots on the stack so that they may be loaded by dereferencing
4203   // the result of va_next.
4204   // If there is no regs to be stored, just point address after last
4205   // argument passed via stack.
4206   int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
4207                                   CCInfo.getInRegsParamsCount(),
4208                                   CCInfo.getNextStackOffset(),
4209                                   std::max(4U, TotalArgRegsSaveSize));
4210   AFI->setVarArgsFrameIndex(FrameIndex);
4211 }
4212 
4213 bool ARMTargetLowering::splitValueIntoRegisterParts(
4214     SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4215     unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const {
4216   bool IsABIRegCopy = CC.hasValue();
4217   EVT ValueVT = Val.getValueType();
4218   if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
4219       PartVT == MVT::f32) {
4220     unsigned ValueBits = ValueVT.getSizeInBits();
4221     unsigned PartBits = PartVT.getSizeInBits();
4222     Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4223     Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4224     Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4225     Parts[0] = Val;
4226     return true;
4227   }
4228   return false;
4229 }
4230 
4231 SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4232     SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4233     MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const {
4234   bool IsABIRegCopy = CC.hasValue();
4235   if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
4236       PartVT == MVT::f32) {
4237     unsigned ValueBits = ValueVT.getSizeInBits();
4238     unsigned PartBits = PartVT.getSizeInBits();
4239     SDValue Val = Parts[0];
4240 
4241     Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4242     Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4243     Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4244     return Val;
4245   }
4246   return SDValue();
4247 }
4248 
4249 SDValue ARMTargetLowering::LowerFormalArguments(
4250     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4251     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4252     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4253   MachineFunction &MF = DAG.getMachineFunction();
4254   MachineFrameInfo &MFI = MF.getFrameInfo();
4255 
4256   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4257 
4258   // Assign locations to all of the incoming arguments.
4259   SmallVector<CCValAssign, 16> ArgLocs;
4260   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4261                  *DAG.getContext());
4262   CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4263 
4264   SmallVector<SDValue, 16> ArgValues;
4265   SDValue ArgValue;
4266   Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
4267   unsigned CurArgIdx = 0;
4268 
4269   // Initially ArgRegsSaveSize is zero.
4270   // Then we increase this value each time we meet byval parameter.
4271   // We also increase this value in case of varargs function.
4272   AFI->setArgRegsSaveSize(0);
4273 
4274   // Calculate the amount of stack space that we need to allocate to store
4275   // byval and variadic arguments that are passed in registers.
4276   // We need to know this before we allocate the first byval or variadic
4277   // argument, as they will be allocated a stack slot below the CFA (Canonical
4278   // Frame Address, the stack pointer at entry to the function).
4279   unsigned ArgRegBegin = ARM::R4;
4280   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4281     if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4282       break;
4283 
4284     CCValAssign &VA = ArgLocs[i];
4285     unsigned Index = VA.getValNo();
4286     ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4287     if (!Flags.isByVal())
4288       continue;
4289 
4290     assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4291     unsigned RBegin, REnd;
4292     CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4293     ArgRegBegin = std::min(ArgRegBegin, RBegin);
4294 
4295     CCInfo.nextInRegsParam();
4296   }
4297   CCInfo.rewindByValRegsInfo();
4298 
4299   int lastInsIndex = -1;
4300   if (isVarArg && MFI.hasVAStart()) {
4301     unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4302     if (RegIdx != array_lengthof(GPRArgRegs))
4303       ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4304   }
4305 
4306   unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4307   AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4308   auto PtrVT = getPointerTy(DAG.getDataLayout());
4309 
4310   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4311     CCValAssign &VA = ArgLocs[i];
4312     if (Ins[VA.getValNo()].isOrigArg()) {
4313       std::advance(CurOrigArg,
4314                    Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4315       CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4316     }
4317     // Arguments stored in registers.
4318     if (VA.isRegLoc()) {
4319       EVT RegVT = VA.getLocVT();
4320 
4321       if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4322         // f64 and vector types are split up into multiple registers or
4323         // combinations of registers and stack slots.
4324         SDValue ArgValue1 =
4325             GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4326         VA = ArgLocs[++i]; // skip ahead to next loc
4327         SDValue ArgValue2;
4328         if (VA.isMemLoc()) {
4329           int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4330           SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4331           ArgValue2 = DAG.getLoad(
4332               MVT::f64, dl, Chain, FIN,
4333               MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4334         } else {
4335           ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4336         }
4337         ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4338         ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4339                                ArgValue1, DAG.getIntPtrConstant(0, dl));
4340         ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4341                                ArgValue2, DAG.getIntPtrConstant(1, dl));
4342       } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4343         ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4344       } else {
4345         const TargetRegisterClass *RC;
4346 
4347         if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4348           RC = &ARM::HPRRegClass;
4349         else if (RegVT == MVT::f32)
4350           RC = &ARM::SPRRegClass;
4351         else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4352                  RegVT == MVT::v4bf16)
4353           RC = &ARM::DPRRegClass;
4354         else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4355                  RegVT == MVT::v8bf16)
4356           RC = &ARM::QPRRegClass;
4357         else if (RegVT == MVT::i32)
4358           RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4359                                            : &ARM::GPRRegClass;
4360         else
4361           llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4362 
4363         // Transform the arguments in physical registers into virtual ones.
4364         unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
4365         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4366 
4367         // If this value is passed in r0 and has the returned attribute (e.g.
4368         // C++ 'structors), record this fact for later use.
4369         if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4370           AFI->setPreservesR0();
4371         }
4372       }
4373 
4374       // If this is an 8 or 16-bit value, it is really passed promoted
4375       // to 32 bits.  Insert an assert[sz]ext to capture this, then
4376       // truncate to the right size.
4377       switch (VA.getLocInfo()) {
4378       default: llvm_unreachable("Unknown loc info!");
4379       case CCValAssign::Full: break;
4380       case CCValAssign::BCvt:
4381         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4382         break;
4383       case CCValAssign::SExt:
4384         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
4385                                DAG.getValueType(VA.getValVT()));
4386         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4387         break;
4388       case CCValAssign::ZExt:
4389         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
4390                                DAG.getValueType(VA.getValVT()));
4391         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4392         break;
4393       }
4394 
4395       // f16 arguments have their size extended to 4 bytes and passed as if they
4396       // had been copied to the LSBs of a 32-bit register.
4397       // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
4398       if (VA.needsCustom() &&
4399           (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4400         ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4401 
4402       InVals.push_back(ArgValue);
4403     } else { // VA.isRegLoc()
4404       // sanity check
4405       assert(VA.isMemLoc());
4406       assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4407 
4408       int index = VA.getValNo();
4409 
4410       // Some Ins[] entries become multiple ArgLoc[] entries.
4411       // Process them only once.
4412       if (index != lastInsIndex)
4413         {
4414           ISD::ArgFlagsTy Flags = Ins[index].Flags;
4415           // FIXME: For now, all byval parameter objects are marked mutable.
4416           // This can be changed with more analysis.
4417           // In case of tail call optimization mark all arguments mutable.
4418           // Since they could be overwritten by lowering of arguments in case of
4419           // a tail call.
4420           if (Flags.isByVal()) {
4421             assert(Ins[index].isOrigArg() &&
4422                    "Byval arguments cannot be implicit");
4423             unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4424 
4425             int FrameIndex = StoreByValRegs(
4426                 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4427                 VA.getLocMemOffset(), Flags.getByValSize());
4428             InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4429             CCInfo.nextInRegsParam();
4430           } else {
4431             unsigned FIOffset = VA.getLocMemOffset();
4432             int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4433                                            FIOffset, true);
4434 
4435             // Create load nodes to retrieve arguments from the stack.
4436             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4437             InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4438                                          MachinePointerInfo::getFixedStack(
4439                                              DAG.getMachineFunction(), FI)));
4440           }
4441           lastInsIndex = index;
4442         }
4443     }
4444   }
4445 
4446   // varargs
4447   if (isVarArg && MFI.hasVAStart()) {
4448     VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(),
4449                          TotalArgRegsSaveSize);
4450     if (AFI->isCmseNSEntryFunction()) {
4451       DiagnosticInfoUnsupported Diag(
4452           DAG.getMachineFunction().getFunction(),
4453           "secure entry function must not be variadic", dl.getDebugLoc());
4454       DAG.getContext()->diagnose(Diag);
4455     }
4456   }
4457 
4458   AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
4459 
4460   if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) {
4461     DiagnosticInfoUnsupported Diag(
4462         DAG.getMachineFunction().getFunction(),
4463         "secure entry function requires arguments on stack", dl.getDebugLoc());
4464     DAG.getContext()->diagnose(Diag);
4465   }
4466 
4467   return Chain;
4468 }
4469 
4470 /// isFloatingPointZero - Return true if this is +0.0.
4471 static bool isFloatingPointZero(SDValue Op) {
4472   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
4473     return CFP->getValueAPF().isPosZero();
4474   else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4475     // Maybe this has already been legalized into the constant pool?
4476     if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4477       SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4478       if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
4479         if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4480           return CFP->getValueAPF().isPosZero();
4481     }
4482   } else if (Op->getOpcode() == ISD::BITCAST &&
4483              Op->getValueType(0) == MVT::f64) {
4484     // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4485     // created by LowerConstantFP().
4486     SDValue BitcastOp = Op->getOperand(0);
4487     if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4488         isNullConstant(BitcastOp->getOperand(0)))
4489       return true;
4490   }
4491   return false;
4492 }
4493 
4494 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for
4495 /// the given operands.
4496 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4497                                      SDValue &ARMcc, SelectionDAG &DAG,
4498                                      const SDLoc &dl) const {
4499   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4500     unsigned C = RHSC->getZExtValue();
4501     if (!isLegalICmpImmediate((int32_t)C)) {
4502       // Constant does not fit, try adjusting it by one.
4503       switch (CC) {
4504       default: break;
4505       case ISD::SETLT:
4506       case ISD::SETGE:
4507         if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4508           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4509           RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4510         }
4511         break;
4512       case ISD::SETULT:
4513       case ISD::SETUGE:
4514         if (C != 0 && isLegalICmpImmediate(C-1)) {
4515           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4516           RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4517         }
4518         break;
4519       case ISD::SETLE:
4520       case ISD::SETGT:
4521         if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4522           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4523           RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4524         }
4525         break;
4526       case ISD::SETULE:
4527       case ISD::SETUGT:
4528         if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4529           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4530           RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4531         }
4532         break;
4533       }
4534     }
4535   } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4536              (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
4537     // In ARM and Thumb-2, the compare instructions can shift their second
4538     // operand.
4539     CC = ISD::getSetCCSwappedOperands(CC);
4540     std::swap(LHS, RHS);
4541   }
4542 
4543   // Thumb1 has very limited immediate modes, so turning an "and" into a
4544   // shift can save multiple instructions.
4545   //
4546   // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4547   // into "((x << n) >> n)".  But that isn't necessarily profitable on its
4548   // own. If it's the operand to an unsigned comparison with an immediate,
4549   // we can eliminate one of the shifts: we transform
4550   // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4551   //
4552   // We avoid transforming cases which aren't profitable due to encoding
4553   // details:
4554   //
4555   // 1. C2 fits into the immediate field of a cmp, and the transformed version
4556   // would not; in that case, we're essentially trading one immediate load for
4557   // another.
4558   // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4559   // 3. C2 is zero; we have other code for this special case.
4560   //
4561   // FIXME: Figure out profitability for Thumb2; we usually can't save an
4562   // instruction, since the AND is always one instruction anyway, but we could
4563   // use narrow instructions in some cases.
4564   if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4565       LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4566       LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4567       !isSignedIntSetCC(CC)) {
4568     unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue();
4569     auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4570     uint64_t RHSV = RHSC->getZExtValue();
4571     if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4572       unsigned ShiftBits = countLeadingZeros(Mask);
4573       if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4574         SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4575         LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4576         RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4577       }
4578     }
4579   }
4580 
4581   // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4582   // single "lsls x, c+1".  The shift sets the "C" and "Z" flags the same
4583   // way a cmp would.
4584   // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4585   // some tweaks to the heuristics for the previous and->shift transform.
4586   // FIXME: Optimize cases where the LHS isn't a shift.
4587   if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4588       isa<ConstantSDNode>(RHS) &&
4589       cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
4590       CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4591       cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) {
4592     unsigned ShiftAmt =
4593       cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1;
4594     SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
4595                                 DAG.getVTList(MVT::i32, MVT::i32),
4596                                 LHS.getOperand(0),
4597                                 DAG.getConstant(ShiftAmt, dl, MVT::i32));
4598     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
4599                                      Shift.getValue(1), SDValue());
4600     ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4601     return Chain.getValue(1);
4602   }
4603 
4604   ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4605 
4606   // If the RHS is a constant zero then the V (overflow) flag will never be
4607   // set. This can allow us to simplify GE to PL or LT to MI, which can be
4608   // simpler for other passes (like the peephole optimiser) to deal with.
4609   if (isNullConstant(RHS)) {
4610     switch (CondCode) {
4611       default: break;
4612       case ARMCC::GE:
4613         CondCode = ARMCC::PL;
4614         break;
4615       case ARMCC::LT:
4616         CondCode = ARMCC::MI;
4617         break;
4618     }
4619   }
4620 
4621   ARMISD::NodeType CompareType;
4622   switch (CondCode) {
4623   default:
4624     CompareType = ARMISD::CMP;
4625     break;
4626   case ARMCC::EQ:
4627   case ARMCC::NE:
4628     // Uses only Z Flag
4629     CompareType = ARMISD::CMPZ;
4630     break;
4631   }
4632   ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4633   return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
4634 }
4635 
4636 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4637 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4638                                      SelectionDAG &DAG, const SDLoc &dl,
4639                                      bool Signaling) const {
4640   assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4641   SDValue Cmp;
4642   if (!isFloatingPointZero(RHS))
4643     Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP,
4644                       dl, MVT::Glue, LHS, RHS);
4645   else
4646     Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0,
4647                       dl, MVT::Glue, LHS);
4648   return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
4649 }
4650 
4651 /// duplicateCmp - Glue values can have only one use, so this function
4652 /// duplicates a comparison node.
4653 SDValue
4654 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
4655   unsigned Opc = Cmp.getOpcode();
4656   SDLoc DL(Cmp);
4657   if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
4658     return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
4659 
4660   assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
4661   Cmp = Cmp.getOperand(0);
4662   Opc = Cmp.getOpcode();
4663   if (Opc == ARMISD::CMPFP)
4664     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
4665   else {
4666     assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
4667     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
4668   }
4669   return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
4670 }
4671 
4672 // This function returns three things: the arithmetic computation itself
4673 // (Value), a comparison (OverflowCmp), and a condition code (ARMcc).  The
4674 // comparison and the condition code define the case in which the arithmetic
4675 // computation *does not* overflow.
4676 std::pair<SDValue, SDValue>
4677 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4678                                  SDValue &ARMcc) const {
4679   assert(Op.getValueType() == MVT::i32 &&  "Unsupported value type");
4680 
4681   SDValue Value, OverflowCmp;
4682   SDValue LHS = Op.getOperand(0);
4683   SDValue RHS = Op.getOperand(1);
4684   SDLoc dl(Op);
4685 
4686   // FIXME: We are currently always generating CMPs because we don't support
4687   // generating CMN through the backend. This is not as good as the natural
4688   // CMP case because it causes a register dependency and cannot be folded
4689   // later.
4690 
4691   switch (Op.getOpcode()) {
4692   default:
4693     llvm_unreachable("Unknown overflow instruction!");
4694   case ISD::SADDO:
4695     ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4696     Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4697     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
4698     break;
4699   case ISD::UADDO:
4700     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4701     // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4702     // We do not use it in the USUBO case as Value may not be used.
4703     Value = DAG.getNode(ARMISD::ADDC, dl,
4704                         DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4705                 .getValue(0);
4706     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
4707     break;
4708   case ISD::SSUBO:
4709     ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4710     Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4711     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
4712     break;
4713   case ISD::USUBO:
4714     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4715     Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4716     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
4717     break;
4718   case ISD::UMULO:
4719     // We generate a UMUL_LOHI and then check if the high word is 0.
4720     ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4721     Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4722                         DAG.getVTList(Op.getValueType(), Op.getValueType()),
4723                         LHS, RHS);
4724     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
4725                               DAG.getConstant(0, dl, MVT::i32));
4726     Value = Value.getValue(0); // We only want the low 32 bits for the result.
4727     break;
4728   case ISD::SMULO:
4729     // We generate a SMUL_LOHI and then check if all the bits of the high word
4730     // are the same as the sign bit of the low word.
4731     ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4732     Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4733                         DAG.getVTList(Op.getValueType(), Op.getValueType()),
4734                         LHS, RHS);
4735     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
4736                               DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4737                                           Value.getValue(0),
4738                                           DAG.getConstant(31, dl, MVT::i32)));
4739     Value = Value.getValue(0); // We only want the low 32 bits for the result.
4740     break;
4741   } // switch (...)
4742 
4743   return std::make_pair(Value, OverflowCmp);
4744 }
4745 
4746 SDValue
4747 ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
4748   // Let legalize expand this if it isn't a legal type yet.
4749   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4750     return SDValue();
4751 
4752   SDValue Value, OverflowCmp;
4753   SDValue ARMcc;
4754   std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4755   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4756   SDLoc dl(Op);
4757   // We use 0 and 1 as false and true values.
4758   SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4759   SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4760   EVT VT = Op.getValueType();
4761 
4762   SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
4763                                  ARMcc, CCR, OverflowCmp);
4764 
4765   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4766   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4767 }
4768 
4769 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
4770                                               SelectionDAG &DAG) {
4771   SDLoc DL(BoolCarry);
4772   EVT CarryVT = BoolCarry.getValueType();
4773 
4774   // This converts the boolean value carry into the carry flag by doing
4775   // ARMISD::SUBC Carry, 1
4776   SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
4777                               DAG.getVTList(CarryVT, MVT::i32),
4778                               BoolCarry, DAG.getConstant(1, DL, CarryVT));
4779   return Carry.getValue(1);
4780 }
4781 
4782 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
4783                                               SelectionDAG &DAG) {
4784   SDLoc DL(Flags);
4785 
4786   // Now convert the carry flag into a boolean carry. We do this
4787   // using ARMISD:ADDE 0, 0, Carry
4788   return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4789                      DAG.getConstant(0, DL, MVT::i32),
4790                      DAG.getConstant(0, DL, MVT::i32), Flags);
4791 }
4792 
4793 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
4794                                              SelectionDAG &DAG) const {
4795   // Let legalize expand this if it isn't a legal type yet.
4796   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4797     return SDValue();
4798 
4799   SDValue LHS = Op.getOperand(0);
4800   SDValue RHS = Op.getOperand(1);
4801   SDLoc dl(Op);
4802 
4803   EVT VT = Op.getValueType();
4804   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4805   SDValue Value;
4806   SDValue Overflow;
4807   switch (Op.getOpcode()) {
4808   default:
4809     llvm_unreachable("Unknown overflow instruction!");
4810   case ISD::UADDO:
4811     Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4812     // Convert the carry flag into a boolean value.
4813     Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4814     break;
4815   case ISD::USUBO: {
4816     Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4817     // Convert the carry flag into a boolean value.
4818     Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4819     // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
4820     // value. So compute 1 - C.
4821     Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
4822                            DAG.getConstant(1, dl, MVT::i32), Overflow);
4823     break;
4824   }
4825   }
4826 
4827   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4828 }
4829 
4830 static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG,
4831                                const ARMSubtarget *Subtarget) {
4832   EVT VT = Op.getValueType();
4833   if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
4834     return SDValue();
4835   if (!VT.isSimple())
4836     return SDValue();
4837 
4838   unsigned NewOpcode;
4839   bool IsAdd = Op->getOpcode() == ISD::SADDSAT;
4840   switch (VT.getSimpleVT().SimpleTy) {
4841   default:
4842     return SDValue();
4843   case MVT::i8:
4844     NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b;
4845     break;
4846   case MVT::i16:
4847     NewOpcode = IsAdd ? ARMISD::QADD16b : ARMISD::QSUB16b;
4848     break;
4849   }
4850 
4851   SDLoc dl(Op);
4852   SDValue Add =
4853       DAG.getNode(NewOpcode, dl, MVT::i32,
4854                   DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
4855                   DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
4856   return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
4857 }
4858 
4859 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
4860   SDValue Cond = Op.getOperand(0);
4861   SDValue SelectTrue = Op.getOperand(1);
4862   SDValue SelectFalse = Op.getOperand(2);
4863   SDLoc dl(Op);
4864   unsigned Opc = Cond.getOpcode();
4865 
4866   if (Cond.getResNo() == 1 &&
4867       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
4868        Opc == ISD::USUBO)) {
4869     if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
4870       return SDValue();
4871 
4872     SDValue Value, OverflowCmp;
4873     SDValue ARMcc;
4874     std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
4875     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4876     EVT VT = Op.getValueType();
4877 
4878     return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
4879                    OverflowCmp, DAG);
4880   }
4881 
4882   // Convert:
4883   //
4884   //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
4885   //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
4886   //
4887   if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
4888     const ConstantSDNode *CMOVTrue =
4889       dyn_cast<ConstantSDNode>(Cond.getOperand(0));
4890     const ConstantSDNode *CMOVFalse =
4891       dyn_cast<ConstantSDNode>(Cond.getOperand(1));
4892 
4893     if (CMOVTrue && CMOVFalse) {
4894       unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
4895       unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
4896 
4897       SDValue True;
4898       SDValue False;
4899       if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
4900         True = SelectTrue;
4901         False = SelectFalse;
4902       } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
4903         True = SelectFalse;
4904         False = SelectTrue;
4905       }
4906 
4907       if (True.getNode() && False.getNode()) {
4908         EVT VT = Op.getValueType();
4909         SDValue ARMcc = Cond.getOperand(2);
4910         SDValue CCR = Cond.getOperand(3);
4911         SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
4912         assert(True.getValueType() == VT);
4913         return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
4914       }
4915     }
4916   }
4917 
4918   // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
4919   // undefined bits before doing a full-word comparison with zero.
4920   Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
4921                      DAG.getConstant(1, dl, Cond.getValueType()));
4922 
4923   return DAG.getSelectCC(dl, Cond,
4924                          DAG.getConstant(0, dl, Cond.getValueType()),
4925                          SelectTrue, SelectFalse, ISD::SETNE);
4926 }
4927 
4928 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
4929                                  bool &swpCmpOps, bool &swpVselOps) {
4930   // Start by selecting the GE condition code for opcodes that return true for
4931   // 'equality'
4932   if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
4933       CC == ISD::SETULE || CC == ISD::SETGE  || CC == ISD::SETLE)
4934     CondCode = ARMCC::GE;
4935 
4936   // and GT for opcodes that return false for 'equality'.
4937   else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
4938            CC == ISD::SETULT || CC == ISD::SETGT  || CC == ISD::SETLT)
4939     CondCode = ARMCC::GT;
4940 
4941   // Since we are constrained to GE/GT, if the opcode contains 'less', we need
4942   // to swap the compare operands.
4943   if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
4944       CC == ISD::SETULT || CC == ISD::SETLE  || CC == ISD::SETLT)
4945     swpCmpOps = true;
4946 
4947   // Both GT and GE are ordered comparisons, and return false for 'unordered'.
4948   // If we have an unordered opcode, we need to swap the operands to the VSEL
4949   // instruction (effectively negating the condition).
4950   //
4951   // This also has the effect of swapping which one of 'less' or 'greater'
4952   // returns true, so we also swap the compare operands. It also switches
4953   // whether we return true for 'equality', so we compensate by picking the
4954   // opposite condition code to our original choice.
4955   if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
4956       CC == ISD::SETUGT) {
4957     swpCmpOps = !swpCmpOps;
4958     swpVselOps = !swpVselOps;
4959     CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
4960   }
4961 
4962   // 'ordered' is 'anything but unordered', so use the VS condition code and
4963   // swap the VSEL operands.
4964   if (CC == ISD::SETO) {
4965     CondCode = ARMCC::VS;
4966     swpVselOps = true;
4967   }
4968 
4969   // 'unordered or not equal' is 'anything but equal', so use the EQ condition
4970   // code and swap the VSEL operands. Also do this if we don't care about the
4971   // unordered case.
4972   if (CC == ISD::SETUNE || CC == ISD::SETNE) {
4973     CondCode = ARMCC::EQ;
4974     swpVselOps = true;
4975   }
4976 }
4977 
4978 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
4979                                    SDValue TrueVal, SDValue ARMcc, SDValue CCR,
4980                                    SDValue Cmp, SelectionDAG &DAG) const {
4981   if (!Subtarget->hasFP64() && VT == MVT::f64) {
4982     FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4983                            DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
4984     TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4985                           DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
4986 
4987     SDValue TrueLow = TrueVal.getValue(0);
4988     SDValue TrueHigh = TrueVal.getValue(1);
4989     SDValue FalseLow = FalseVal.getValue(0);
4990     SDValue FalseHigh = FalseVal.getValue(1);
4991 
4992     SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
4993                               ARMcc, CCR, Cmp);
4994     SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
4995                                ARMcc, CCR, duplicateCmp(Cmp, DAG));
4996 
4997     return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
4998   } else {
4999     return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
5000                        Cmp);
5001   }
5002 }
5003 
5004 static bool isGTorGE(ISD::CondCode CC) {
5005   return CC == ISD::SETGT || CC == ISD::SETGE;
5006 }
5007 
5008 static bool isLTorLE(ISD::CondCode CC) {
5009   return CC == ISD::SETLT || CC == ISD::SETLE;
5010 }
5011 
5012 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
5013 // All of these conditions (and their <= and >= counterparts) will do:
5014 //          x < k ? k : x
5015 //          x > k ? x : k
5016 //          k < x ? x : k
5017 //          k > x ? k : x
5018 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
5019                             const SDValue TrueVal, const SDValue FalseVal,
5020                             const ISD::CondCode CC, const SDValue K) {
5021   return (isGTorGE(CC) &&
5022           ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
5023          (isLTorLE(CC) &&
5024           ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
5025 }
5026 
5027 // Check if two chained conditionals could be converted into SSAT or USAT.
5028 //
5029 // SSAT can replace a set of two conditional selectors that bound a number to an
5030 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
5031 //
5032 //     x < -k ? -k : (x > k ? k : x)
5033 //     x < -k ? -k : (x < k ? x : k)
5034 //     x > -k ? (x > k ? k : x) : -k
5035 //     x < k ? (x < -k ? -k : x) : k
5036 //     etc.
5037 //
5038 // LLVM canonicalizes these to either a min(max()) or a max(min())
5039 // pattern. This function tries to match one of these and will return a SSAT
5040 // node if successful.
5041 //
5042 // USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1
5043 // is a power of 2.
5044 static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
5045   EVT VT = Op.getValueType();
5046   SDValue V1 = Op.getOperand(0);
5047   SDValue K1 = Op.getOperand(1);
5048   SDValue TrueVal1 = Op.getOperand(2);
5049   SDValue FalseVal1 = Op.getOperand(3);
5050   ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5051 
5052   const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
5053   if (Op2.getOpcode() != ISD::SELECT_CC)
5054     return SDValue();
5055 
5056   SDValue V2 = Op2.getOperand(0);
5057   SDValue K2 = Op2.getOperand(1);
5058   SDValue TrueVal2 = Op2.getOperand(2);
5059   SDValue FalseVal2 = Op2.getOperand(3);
5060   ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
5061 
5062   SDValue V1Tmp = V1;
5063   SDValue V2Tmp = V2;
5064 
5065   // Check that the registers and the constants match a max(min()) or min(max())
5066   // pattern
5067   if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5068       K2 != FalseVal2 ||
5069       !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
5070     return SDValue();
5071 
5072   // Check that the constant in the lower-bound check is
5073   // the opposite of the constant in the upper-bound check
5074   // in 1's complement.
5075   if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
5076     return SDValue();
5077 
5078   int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
5079   int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
5080   int64_t PosVal = std::max(Val1, Val2);
5081   int64_t NegVal = std::min(Val1, Val2);
5082 
5083   if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
5084       !isPowerOf2_64(PosVal + 1))
5085     return SDValue();
5086 
5087   // Handle the difference between USAT (unsigned) and SSAT (signed)
5088   // saturation
5089   // At this point, PosVal is guaranteed to be positive
5090   uint64_t K = PosVal;
5091   SDLoc dl(Op);
5092   if (Val1 == ~Val2)
5093     return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
5094                        DAG.getConstant(countTrailingOnes(K), dl, VT));
5095   if (NegVal == 0)
5096     return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
5097                        DAG.getConstant(countTrailingOnes(K), dl, VT));
5098 
5099   return SDValue();
5100 }
5101 
5102 // Check if a condition of the type x < k ? k : x can be converted into a
5103 // bit operation instead of conditional moves.
5104 // Currently this is allowed given:
5105 // - The conditions and values match up
5106 // - k is 0 or -1 (all ones)
5107 // This function will not check the last condition, thats up to the caller
5108 // It returns true if the transformation can be made, and in such case
5109 // returns x in V, and k in SatK.
5110 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
5111                                          SDValue &SatK)
5112 {
5113   SDValue LHS = Op.getOperand(0);
5114   SDValue RHS = Op.getOperand(1);
5115   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5116   SDValue TrueVal = Op.getOperand(2);
5117   SDValue FalseVal = Op.getOperand(3);
5118 
5119   SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
5120                                                ? &RHS
5121                                                : nullptr;
5122 
5123   // No constant operation in comparison, early out
5124   if (!K)
5125     return false;
5126 
5127   SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
5128   V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5129   SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5130 
5131   // If the constant on left and right side, or variable on left and right,
5132   // does not match, early out
5133   if (*K != KTmp || V != VTmp)
5134     return false;
5135 
5136   if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
5137     SatK = *K;
5138     return true;
5139   }
5140 
5141   return false;
5142 }
5143 
5144 bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5145   if (VT == MVT::f32)
5146     return !Subtarget->hasVFP2Base();
5147   if (VT == MVT::f64)
5148     return !Subtarget->hasFP64();
5149   if (VT == MVT::f16)
5150     return !Subtarget->hasFullFP16();
5151   return false;
5152 }
5153 
5154 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5155   EVT VT = Op.getValueType();
5156   SDLoc dl(Op);
5157 
5158   // Try to convert two saturating conditional selects into a single SSAT
5159   if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5160     if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
5161       return SatValue;
5162 
5163   // Try to convert expressions of the form x < k ? k : x (and similar forms)
5164   // into more efficient bit operations, which is possible when k is 0 or -1
5165   // On ARM and Thumb-2 which have flexible operand 2 this will result in
5166   // single instructions. On Thumb the shift and the bit operation will be two
5167   // instructions.
5168   // Only allow this transformation on full-width (32-bit) operations
5169   SDValue LowerSatConstant;
5170   SDValue SatValue;
5171   if (VT == MVT::i32 &&
5172       isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
5173     SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
5174                                  DAG.getConstant(31, dl, VT));
5175     if (isNullConstant(LowerSatConstant)) {
5176       SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
5177                                       DAG.getAllOnesConstant(dl, VT));
5178       return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
5179     } else if (isAllOnesConstant(LowerSatConstant))
5180       return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
5181   }
5182 
5183   SDValue LHS = Op.getOperand(0);
5184   SDValue RHS = Op.getOperand(1);
5185   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5186   SDValue TrueVal = Op.getOperand(2);
5187   SDValue FalseVal = Op.getOperand(3);
5188   ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5189   ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
5190 
5191   if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
5192       LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
5193     unsigned TVal = CTVal->getZExtValue();
5194     unsigned FVal = CFVal->getZExtValue();
5195     unsigned Opcode = 0;
5196 
5197     if (TVal == ~FVal) {
5198       Opcode = ARMISD::CSINV;
5199     } else if (TVal == ~FVal + 1) {
5200       Opcode = ARMISD::CSNEG;
5201     } else if (TVal + 1 == FVal) {
5202       Opcode = ARMISD::CSINC;
5203     } else if (TVal == FVal + 1) {
5204       Opcode = ARMISD::CSINC;
5205       std::swap(TrueVal, FalseVal);
5206       std::swap(TVal, FVal);
5207       CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5208     }
5209 
5210     if (Opcode) {
5211       // If one of the constants is cheaper than another, materialise the
5212       // cheaper one and let the csel generate the other.
5213       if (Opcode != ARMISD::CSINC &&
5214           HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
5215         std::swap(TrueVal, FalseVal);
5216         std::swap(TVal, FVal);
5217         CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5218       }
5219 
5220       // Attempt to use ZR checking TVal is 0, possibly inverting the condition
5221       // to get there. CSINC not is invertable like the other two (~(~a) == a,
5222       // -(-a) == a, but (a+1)+1 != a).
5223       if (FVal == 0 && Opcode != ARMISD::CSINC) {
5224         std::swap(TrueVal, FalseVal);
5225         std::swap(TVal, FVal);
5226         CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5227       }
5228       if (TVal == 0)
5229         TrueVal = DAG.getRegister(ARM::ZR, MVT::i32);
5230 
5231       // Drops F's value because we can get it by inverting/negating TVal.
5232       FalseVal = TrueVal;
5233 
5234       SDValue ARMcc;
5235       SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5236       EVT VT = TrueVal.getValueType();
5237       return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5238     }
5239   }
5240 
5241   if (isUnsupportedFloatingType(LHS.getValueType())) {
5242     DAG.getTargetLoweringInfo().softenSetCCOperands(
5243         DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5244 
5245     // If softenSetCCOperands only returned one value, we should compare it to
5246     // zero.
5247     if (!RHS.getNode()) {
5248       RHS = DAG.getConstant(0, dl, LHS.getValueType());
5249       CC = ISD::SETNE;
5250     }
5251   }
5252 
5253   if (LHS.getValueType() == MVT::i32) {
5254     // Try to generate VSEL on ARMv8.
5255     // The VSEL instruction can't use all the usual ARM condition
5256     // codes: it only has two bits to select the condition code, so it's
5257     // constrained to use only GE, GT, VS and EQ.
5258     //
5259     // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5260     // swap the operands of the previous compare instruction (effectively
5261     // inverting the compare condition, swapping 'less' and 'greater') and
5262     // sometimes need to swap the operands to the VSEL (which inverts the
5263     // condition in the sense of firing whenever the previous condition didn't)
5264     if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5265                                         TrueVal.getValueType() == MVT::f32 ||
5266                                         TrueVal.getValueType() == MVT::f64)) {
5267       ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5268       if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5269           CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5270         CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5271         std::swap(TrueVal, FalseVal);
5272       }
5273     }
5274 
5275     SDValue ARMcc;
5276     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5277     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5278     // Choose GE over PL, which vsel does now support
5279     if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
5280       ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5281     return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
5282   }
5283 
5284   ARMCC::CondCodes CondCode, CondCode2;
5285   FPCCToARMCC(CC, CondCode, CondCode2);
5286 
5287   // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5288   // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5289   // must use VSEL (limited condition codes), due to not having conditional f16
5290   // moves.
5291   if (Subtarget->hasFPARMv8Base() &&
5292       !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5293       (TrueVal.getValueType() == MVT::f16 ||
5294        TrueVal.getValueType() == MVT::f32 ||
5295        TrueVal.getValueType() == MVT::f64)) {
5296     bool swpCmpOps = false;
5297     bool swpVselOps = false;
5298     checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
5299 
5300     if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5301         CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5302       if (swpCmpOps)
5303         std::swap(LHS, RHS);
5304       if (swpVselOps)
5305         std::swap(TrueVal, FalseVal);
5306     }
5307   }
5308 
5309   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5310   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5311   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5312   SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
5313   if (CondCode2 != ARMCC::AL) {
5314     SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
5315     // FIXME: Needs another CMP because flag can have but one use.
5316     SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
5317     Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
5318   }
5319   return Result;
5320 }
5321 
5322 /// canChangeToInt - Given the fp compare operand, return true if it is suitable
5323 /// to morph to an integer compare sequence.
5324 static bool canChangeToInt(SDValue Op, bool &SeenZero,
5325                            const ARMSubtarget *Subtarget) {
5326   SDNode *N = Op.getNode();
5327   if (!N->hasOneUse())
5328     // Otherwise it requires moving the value from fp to integer registers.
5329     return false;
5330   if (!N->getNumValues())
5331     return false;
5332   EVT VT = Op.getValueType();
5333   if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5334     // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5335     // vmrs are very slow, e.g. cortex-a8.
5336     return false;
5337 
5338   if (isFloatingPointZero(Op)) {
5339     SeenZero = true;
5340     return true;
5341   }
5342   return ISD::isNormalLoad(N);
5343 }
5344 
5345 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
5346   if (isFloatingPointZero(Op))
5347     return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5348 
5349   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
5350     return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5351                        Ld->getPointerInfo(), Ld->getAlignment(),
5352                        Ld->getMemOperand()->getFlags());
5353 
5354   llvm_unreachable("Unknown VFP cmp argument!");
5355 }
5356 
5357 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
5358                            SDValue &RetVal1, SDValue &RetVal2) {
5359   SDLoc dl(Op);
5360 
5361   if (isFloatingPointZero(Op)) {
5362     RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5363     RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5364     return;
5365   }
5366 
5367   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5368     SDValue Ptr = Ld->getBasePtr();
5369     RetVal1 =
5370         DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5371                     Ld->getAlignment(), Ld->getMemOperand()->getFlags());
5372 
5373     EVT PtrType = Ptr.getValueType();
5374     unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
5375     SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5376                                  PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5377     RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5378                           Ld->getPointerInfo().getWithOffset(4), NewAlign,
5379                           Ld->getMemOperand()->getFlags());
5380     return;
5381   }
5382 
5383   llvm_unreachable("Unknown VFP cmp argument!");
5384 }
5385 
5386 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
5387 /// f32 and even f64 comparisons to integer ones.
5388 SDValue
5389 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5390   SDValue Chain = Op.getOperand(0);
5391   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5392   SDValue LHS = Op.getOperand(2);
5393   SDValue RHS = Op.getOperand(3);
5394   SDValue Dest = Op.getOperand(4);
5395   SDLoc dl(Op);
5396 
5397   bool LHSSeenZero = false;
5398   bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5399   bool RHSSeenZero = false;
5400   bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5401   if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5402     // If unsafe fp math optimization is enabled and there are no other uses of
5403     // the CMP operands, and the condition code is EQ or NE, we can optimize it
5404     // to an integer comparison.
5405     if (CC == ISD::SETOEQ)
5406       CC = ISD::SETEQ;
5407     else if (CC == ISD::SETUNE)
5408       CC = ISD::SETNE;
5409 
5410     SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5411     SDValue ARMcc;
5412     if (LHS.getValueType() == MVT::f32) {
5413       LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5414                         bitcastf32Toi32(LHS, DAG), Mask);
5415       RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5416                         bitcastf32Toi32(RHS, DAG), Mask);
5417       SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5418       SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5419       return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
5420                          Chain, Dest, ARMcc, CCR, Cmp);
5421     }
5422 
5423     SDValue LHS1, LHS2;
5424     SDValue RHS1, RHS2;
5425     expandf64Toi32(LHS, DAG, LHS1, LHS2);
5426     expandf64Toi32(RHS, DAG, RHS1, RHS2);
5427     LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5428     RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5429     ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5430     ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5431     SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
5432     SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5433     return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
5434   }
5435 
5436   return SDValue();
5437 }
5438 
5439 SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5440   SDValue Chain = Op.getOperand(0);
5441   SDValue Cond = Op.getOperand(1);
5442   SDValue Dest = Op.getOperand(2);
5443   SDLoc dl(Op);
5444 
5445   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5446   // instruction.
5447   unsigned Opc = Cond.getOpcode();
5448   bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5449                       !Subtarget->isThumb1Only();
5450   if (Cond.getResNo() == 1 &&
5451       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5452        Opc == ISD::USUBO || OptimizeMul)) {
5453     // Only lower legal XALUO ops.
5454     if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
5455       return SDValue();
5456 
5457     // The actual operation with overflow check.
5458     SDValue Value, OverflowCmp;
5459     SDValue ARMcc;
5460     std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5461 
5462     // Reverse the condition code.
5463     ARMCC::CondCodes CondCode =
5464         (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5465     CondCode = ARMCC::getOppositeCondition(CondCode);
5466     ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5467     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5468 
5469     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
5470                        OverflowCmp);
5471   }
5472 
5473   return SDValue();
5474 }
5475 
5476 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5477   SDValue Chain = Op.getOperand(0);
5478   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5479   SDValue LHS = Op.getOperand(2);
5480   SDValue RHS = Op.getOperand(3);
5481   SDValue Dest = Op.getOperand(4);
5482   SDLoc dl(Op);
5483 
5484   if (isUnsupportedFloatingType(LHS.getValueType())) {
5485     DAG.getTargetLoweringInfo().softenSetCCOperands(
5486         DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5487 
5488     // If softenSetCCOperands only returned one value, we should compare it to
5489     // zero.
5490     if (!RHS.getNode()) {
5491       RHS = DAG.getConstant(0, dl, LHS.getValueType());
5492       CC = ISD::SETNE;
5493     }
5494   }
5495 
5496   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5497   // instruction.
5498   unsigned Opc = LHS.getOpcode();
5499   bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5500                       !Subtarget->isThumb1Only();
5501   if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5502       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5503        Opc == ISD::USUBO || OptimizeMul) &&
5504       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5505     // Only lower legal XALUO ops.
5506     if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
5507       return SDValue();
5508 
5509     // The actual operation with overflow check.
5510     SDValue Value, OverflowCmp;
5511     SDValue ARMcc;
5512     std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5513 
5514     if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5515       // Reverse the condition code.
5516       ARMCC::CondCodes CondCode =
5517           (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5518       CondCode = ARMCC::getOppositeCondition(CondCode);
5519       ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5520     }
5521     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5522 
5523     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
5524                        OverflowCmp);
5525   }
5526 
5527   if (LHS.getValueType() == MVT::i32) {
5528     SDValue ARMcc;
5529     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5530     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5531     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
5532                        Chain, Dest, ARMcc, CCR, Cmp);
5533   }
5534 
5535   if (getTargetMachine().Options.UnsafeFPMath &&
5536       (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
5537        CC == ISD::SETNE || CC == ISD::SETUNE)) {
5538     if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5539       return Result;
5540   }
5541 
5542   ARMCC::CondCodes CondCode, CondCode2;
5543   FPCCToARMCC(CC, CondCode, CondCode2);
5544 
5545   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5546   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5547   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5548   SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
5549   SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
5550   SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
5551   if (CondCode2 != ARMCC::AL) {
5552     ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5553     SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
5554     Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
5555   }
5556   return Res;
5557 }
5558 
5559 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5560   SDValue Chain = Op.getOperand(0);
5561   SDValue Table = Op.getOperand(1);
5562   SDValue Index = Op.getOperand(2);
5563   SDLoc dl(Op);
5564 
5565   EVT PTy = getPointerTy(DAG.getDataLayout());
5566   JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
5567   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5568   Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5569   Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5570   SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5571   if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5572     // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
5573     // which does another jump to the destination. This also makes it easier
5574     // to translate it to TBB / TBH later (Thumb2 only).
5575     // FIXME: This might not work if the function is extremely large.
5576     return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5577                        Addr, Op.getOperand(2), JTI);
5578   }
5579   if (isPositionIndependent() || Subtarget->isROPI()) {
5580     Addr =
5581         DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5582                     MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5583     Chain = Addr.getValue(1);
5584     Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5585     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5586   } else {
5587     Addr =
5588         DAG.getLoad(PTy, dl, Chain, Addr,
5589                     MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5590     Chain = Addr.getValue(1);
5591     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5592   }
5593 }
5594 
5595 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
5596   EVT VT = Op.getValueType();
5597   SDLoc dl(Op);
5598 
5599   if (Op.getValueType().getVectorElementType() == MVT::i32) {
5600     if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5601       return Op;
5602     return DAG.UnrollVectorOp(Op.getNode());
5603   }
5604 
5605   const bool HasFullFP16 =
5606     static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();
5607 
5608   EVT NewTy;
5609   const EVT OpTy = Op.getOperand(0).getValueType();
5610   if (OpTy == MVT::v4f32)
5611     NewTy = MVT::v4i32;
5612   else if (OpTy == MVT::v4f16 && HasFullFP16)
5613     NewTy = MVT::v4i16;
5614   else if (OpTy == MVT::v8f16 && HasFullFP16)
5615     NewTy = MVT::v8i16;
5616   else
5617     llvm_unreachable("Invalid type for custom lowering!");
5618 
5619   if (VT != MVT::v4i16 && VT != MVT::v8i16)
5620     return DAG.UnrollVectorOp(Op.getNode());
5621 
5622   Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5623   return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5624 }
5625 
5626 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5627   EVT VT = Op.getValueType();
5628   if (VT.isVector())
5629     return LowerVectorFP_TO_INT(Op, DAG);
5630 
5631   bool IsStrict = Op->isStrictFPOpcode();
5632   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5633 
5634   if (isUnsupportedFloatingType(SrcVal.getValueType())) {
5635     RTLIB::Libcall LC;
5636     if (Op.getOpcode() == ISD::FP_TO_SINT ||
5637         Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5638       LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
5639                               Op.getValueType());
5640     else
5641       LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
5642                               Op.getValueType());
5643     SDLoc Loc(Op);
5644     MakeLibCallOptions CallOptions;
5645     SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
5646     SDValue Result;
5647     std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
5648                                           CallOptions, Loc, Chain);
5649     return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
5650   }
5651 
5652   // FIXME: Remove this when we have strict fp instruction selection patterns
5653   if (IsStrict) {
5654     SDLoc Loc(Op);
5655     SDValue Result =
5656         DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
5657                                                              : ISD::FP_TO_UINT,
5658                     Loc, Op.getValueType(), SrcVal);
5659     return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
5660   }
5661 
5662   return Op;
5663 }
5664 
5665 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5666   EVT VT = Op.getValueType();
5667   SDLoc dl(Op);
5668 
5669   if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5670     if (VT.getVectorElementType() == MVT::f32)
5671       return Op;
5672     return DAG.UnrollVectorOp(Op.getNode());
5673   }
5674 
5675   assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5676           Op.getOperand(0).getValueType() == MVT::v8i16) &&
5677          "Invalid type for custom lowering!");
5678 
5679   const bool HasFullFP16 =
5680     static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();
5681 
5682   EVT DestVecType;
5683   if (VT == MVT::v4f32)
5684     DestVecType = MVT::v4i32;
5685   else if (VT == MVT::v4f16 && HasFullFP16)
5686     DestVecType = MVT::v4i16;
5687   else if (VT == MVT::v8f16 && HasFullFP16)
5688     DestVecType = MVT::v8i16;
5689   else
5690     return DAG.UnrollVectorOp(Op.getNode());
5691 
5692   unsigned CastOpc;
5693   unsigned Opc;
5694   switch (Op.getOpcode()) {
5695   default: llvm_unreachable("Invalid opcode!");
5696   case ISD::SINT_TO_FP:
5697     CastOpc = ISD::SIGN_EXTEND;
5698     Opc = ISD::SINT_TO_FP;
5699     break;
5700   case ISD::UINT_TO_FP:
5701     CastOpc = ISD::ZERO_EXTEND;
5702     Opc = ISD::UINT_TO_FP;
5703     break;
5704   }
5705 
5706   Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
5707   return DAG.getNode(Opc, dl, VT, Op);
5708 }
5709 
5710 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5711   EVT VT = Op.getValueType();
5712   if (VT.isVector())
5713     return LowerVectorINT_TO_FP(Op, DAG);
5714   if (isUnsupportedFloatingType(VT)) {
5715     RTLIB::Libcall LC;
5716     if (Op.getOpcode() == ISD::SINT_TO_FP)
5717       LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5718                               Op.getValueType());
5719     else
5720       LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5721                               Op.getValueType());
5722     MakeLibCallOptions CallOptions;
5723     return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5724                        CallOptions, SDLoc(Op)).first;
5725   }
5726 
5727   return Op;
5728 }
5729 
5730 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5731   // Implement fcopysign with a fabs and a conditional fneg.
5732   SDValue Tmp0 = Op.getOperand(0);
5733   SDValue Tmp1 = Op.getOperand(1);
5734   SDLoc dl(Op);
5735   EVT VT = Op.getValueType();
5736   EVT SrcVT = Tmp1.getValueType();
5737   bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
5738     Tmp0.getOpcode() == ARMISD::VMOVDRR;
5739   bool UseNEON = !InGPR && Subtarget->hasNEON();
5740 
5741   if (UseNEON) {
5742     // Use VBSL to copy the sign bit.
5743     unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
5744     SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
5745                                DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
5746     EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
5747     if (VT == MVT::f64)
5748       Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5749                          DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
5750                          DAG.getConstant(32, dl, MVT::i32));
5751     else /*if (VT == MVT::f32)*/
5752       Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
5753     if (SrcVT == MVT::f32) {
5754       Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
5755       if (VT == MVT::f64)
5756         Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5757                            DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
5758                            DAG.getConstant(32, dl, MVT::i32));
5759     } else if (VT == MVT::f32)
5760       Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
5761                          DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
5762                          DAG.getConstant(32, dl, MVT::i32));
5763     Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
5764     Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
5765 
5766     SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
5767                                             dl, MVT::i32);
5768     AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
5769     SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
5770                                   DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
5771 
5772     SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
5773                               DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
5774                               DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
5775     if (VT == MVT::f32) {
5776       Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
5777       Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
5778                         DAG.getConstant(0, dl, MVT::i32));
5779     } else {
5780       Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
5781     }
5782 
5783     return Res;
5784   }
5785 
5786   // Bitcast operand 1 to i32.
5787   if (SrcVT == MVT::f64)
5788     Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
5789                        Tmp1).getValue(1);
5790   Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
5791 
5792   // Or in the signbit with integer operations.
5793   SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
5794   SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5795   Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
5796   if (VT == MVT::f32) {
5797     Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
5798                        DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
5799     return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
5800                        DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
5801   }
5802 
5803   // f64: Or the high part with signbit and then combine two parts.
5804   Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
5805                      Tmp0);
5806   SDValue Lo = Tmp0.getValue(0);
5807   SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
5808   Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
5809   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
5810 }
5811 
5812 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
5813   MachineFunction &MF = DAG.getMachineFunction();
5814   MachineFrameInfo &MFI = MF.getFrameInfo();
5815   MFI.setReturnAddressIsTaken(true);
5816 
5817   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
5818     return SDValue();
5819 
5820   EVT VT = Op.getValueType();
5821   SDLoc dl(Op);
5822   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5823   if (Depth) {
5824     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
5825     SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
5826     return DAG.getLoad(VT, dl, DAG.getEntryNode(),
5827                        DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
5828                        MachinePointerInfo());
5829   }
5830 
5831   // Return LR, which contains the return address. Mark it an implicit live-in.
5832   unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
5833   return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
5834 }
5835 
5836 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
5837   const ARMBaseRegisterInfo &ARI =
5838     *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
5839   MachineFunction &MF = DAG.getMachineFunction();
5840   MachineFrameInfo &MFI = MF.getFrameInfo();
5841   MFI.setFrameAddressIsTaken(true);
5842 
5843   EVT VT = Op.getValueType();
5844   SDLoc dl(Op);  // FIXME probably not meaningful
5845   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5846   Register FrameReg = ARI.getFrameRegister(MF);
5847   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
5848   while (Depth--)
5849     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
5850                             MachinePointerInfo());
5851   return FrameAddr;
5852 }
5853 
5854 // FIXME? Maybe this could be a TableGen attribute on some registers and
5855 // this table could be generated automatically from RegInfo.
5856 Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
5857                                               const MachineFunction &MF) const {
5858   Register Reg = StringSwitch<unsigned>(RegName)
5859                        .Case("sp", ARM::SP)
5860                        .Default(0);
5861   if (Reg)
5862     return Reg;
5863   report_fatal_error(Twine("Invalid register name \""
5864                               + StringRef(RegName)  + "\"."));
5865 }
5866 
5867 // Result is 64 bit value so split into two 32 bit values and return as a
5868 // pair of values.
5869 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
5870                                 SelectionDAG &DAG) {
5871   SDLoc DL(N);
5872 
5873   // This function is only supposed to be called for i64 type destination.
5874   assert(N->getValueType(0) == MVT::i64
5875           && "ExpandREAD_REGISTER called for non-i64 type result.");
5876 
5877   SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
5878                              DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
5879                              N->getOperand(0),
5880                              N->getOperand(1));
5881 
5882   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
5883                     Read.getValue(1)));
5884   Results.push_back(Read.getOperand(0));
5885 }
5886 
5887 /// \p BC is a bitcast that is about to be turned into a VMOVDRR.
5888 /// When \p DstVT, the destination type of \p BC, is on the vector
5889 /// register bank and the source of bitcast, \p Op, operates on the same bank,
5890 /// it might be possible to combine them, such that everything stays on the
5891 /// vector register bank.
5892 /// \p return The node that would replace \p BT, if the combine
5893 /// is possible.
5894 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
5895                                                 SelectionDAG &DAG) {
5896   SDValue Op = BC->getOperand(0);
5897   EVT DstVT = BC->getValueType(0);
5898 
5899   // The only vector instruction that can produce a scalar (remember,
5900   // since the bitcast was about to be turned into VMOVDRR, the source
5901   // type is i64) from a vector is EXTRACT_VECTOR_ELT.
5902   // Moreover, we can do this combine only if there is one use.
5903   // Finally, if the destination type is not a vector, there is not
5904   // much point on forcing everything on the vector bank.
5905   if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5906       !Op.hasOneUse())
5907     return SDValue();
5908 
5909   // If the index is not constant, we will introduce an additional
5910   // multiply that will stick.
5911   // Give up in that case.
5912   ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
5913   if (!Index)
5914     return SDValue();
5915   unsigned DstNumElt = DstVT.getVectorNumElements();
5916 
5917   // Compute the new index.
5918   const APInt &APIntIndex = Index->getAPIntValue();
5919   APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
5920   NewIndex *= APIntIndex;
5921   // Check if the new constant index fits into i32.
5922   if (NewIndex.getBitWidth() > 32)
5923     return SDValue();
5924 
5925   // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
5926   // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
5927   SDLoc dl(Op);
5928   SDValue ExtractSrc = Op.getOperand(0);
5929   EVT VecVT = EVT::getVectorVT(
5930       *DAG.getContext(), DstVT.getScalarType(),
5931       ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
5932   SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
5933   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
5934                      DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
5935 }
5936 
5937 /// ExpandBITCAST - If the target supports VFP, this function is called to
5938 /// expand a bit convert where either the source or destination type is i64 to
5939 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
5940 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
5941 /// vectors), since the legalizer won't know what to do with that.
5942 SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
5943                                          const ARMSubtarget *Subtarget) const {
5944   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5945   SDLoc dl(N);
5946   SDValue Op = N->getOperand(0);
5947 
5948   // This function is only supposed to be called for i16 and i64 types, either
5949   // as the source or destination of the bit convert.
5950   EVT SrcVT = Op.getValueType();
5951   EVT DstVT = N->getValueType(0);
5952 
5953   if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
5954       (DstVT == MVT::f16 || DstVT == MVT::bf16))
5955     return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
5956                      DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
5957 
5958   if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
5959       (SrcVT == MVT::f16 || SrcVT == MVT::bf16))
5960     return DAG.getNode(
5961         ISD::TRUNCATE, SDLoc(N), DstVT,
5962         MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
5963 
5964   if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
5965     return SDValue();
5966 
5967   // Turn i64->f64 into VMOVDRR.
5968   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
5969     // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
5970     // if we can combine the bitcast with its source.
5971     if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
5972       return Val;
5973 
5974     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
5975                              DAG.getConstant(0, dl, MVT::i32));
5976     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
5977                              DAG.getConstant(1, dl, MVT::i32));
5978     return DAG.getNode(ISD::BITCAST, dl, DstVT,
5979                        DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
5980   }
5981 
5982   // Turn f64->i64 into VMOVRRD.
5983   if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
5984     SDValue Cvt;
5985     if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
5986         SrcVT.getVectorNumElements() > 1)
5987       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
5988                         DAG.getVTList(MVT::i32, MVT::i32),
5989                         DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
5990     else
5991       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
5992                         DAG.getVTList(MVT::i32, MVT::i32), Op);
5993     // Merge the pieces into a single i64 value.
5994     return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
5995   }
5996 
5997   return SDValue();
5998 }
5999 
6000 /// getZeroVector - Returns a vector of specified type with all zero elements.
6001 /// Zero vectors are used to represent vector negation and in those cases
6002 /// will be implemented with the NEON VNEG instruction.  However, VNEG does
6003 /// not support i64 elements, so sometimes the zero vectors will need to be
6004 /// explicitly constructed.  Regardless, use a canonical VMOV to create the
6005 /// zero vector.
6006 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6007   assert(VT.isVector() && "Expected a vector type");
6008   // The canonical modified immediate encoding of a zero vector is....0!
6009   SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6010   EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6011   SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6012   return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6013 }
6014 
6015 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two
6016 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
6017 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6018                                                 SelectionDAG &DAG) const {
6019   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6020   EVT VT = Op.getValueType();
6021   unsigned VTBits = VT.getSizeInBits();
6022   SDLoc dl(Op);
6023   SDValue ShOpLo = Op.getOperand(0);
6024   SDValue ShOpHi = Op.getOperand(1);
6025   SDValue ShAmt  = Op.getOperand(2);
6026   SDValue ARMcc;
6027   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6028   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6029 
6030   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6031 
6032   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6033                                  DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6034   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6035   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6036                                    DAG.getConstant(VTBits, dl, MVT::i32));
6037   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6038   SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6039   SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6040   SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6041                             ISD::SETGE, ARMcc, DAG, dl);
6042   SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
6043                            ARMcc, CCR, CmpLo);
6044 
6045   SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6046   SDValue HiBigShift = Opc == ISD::SRA
6047                            ? DAG.getNode(Opc, dl, VT, ShOpHi,
6048                                          DAG.getConstant(VTBits - 1, dl, VT))
6049                            : DAG.getConstant(0, dl, VT);
6050   SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6051                             ISD::SETGE, ARMcc, DAG, dl);
6052   SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
6053                            ARMcc, CCR, CmpHi);
6054 
6055   SDValue Ops[2] = { Lo, Hi };
6056   return DAG.getMergeValues(Ops, dl);
6057 }
6058 
6059 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6060 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
6061 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6062                                                SelectionDAG &DAG) const {
6063   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6064   EVT VT = Op.getValueType();
6065   unsigned VTBits = VT.getSizeInBits();
6066   SDLoc dl(Op);
6067   SDValue ShOpLo = Op.getOperand(0);
6068   SDValue ShOpHi = Op.getOperand(1);
6069   SDValue ShAmt  = Op.getOperand(2);
6070   SDValue ARMcc;
6071   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6072 
6073   assert(Op.getOpcode() == ISD::SHL_PARTS);
6074   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6075                                  DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6076   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6077   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6078   SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6079 
6080   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6081                                    DAG.getConstant(VTBits, dl, MVT::i32));
6082   SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6083   SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6084                             ISD::SETGE, ARMcc, DAG, dl);
6085   SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
6086                            ARMcc, CCR, CmpHi);
6087 
6088   SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6089                           ISD::SETGE, ARMcc, DAG, dl);
6090   SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6091   SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6092                            DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);
6093 
6094   SDValue Ops[2] = { Lo, Hi };
6095   return DAG.getMergeValues(Ops, dl);
6096 }
6097 
6098 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
6099                                             SelectionDAG &DAG) const {
6100   // The rounding mode is in bits 23:22 of the FPSCR.
6101   // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
6102   // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
6103   // so that the shift + and get folded into a bitfield extract.
6104   SDLoc dl(Op);
6105   SDValue Chain = Op.getOperand(0);
6106   SDValue Ops[] = {Chain,
6107                    DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6108 
6109   SDValue FPSCR =
6110       DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6111   Chain = FPSCR.getValue(1);
6112   SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6113                                   DAG.getConstant(1U << 22, dl, MVT::i32));
6114   SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6115                               DAG.getConstant(22, dl, MVT::i32));
6116   SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6117                             DAG.getConstant(3, dl, MVT::i32));
6118   return DAG.getMergeValues({And, Chain}, dl);
6119 }
6120 
6121 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
6122                          const ARMSubtarget *ST) {
6123   SDLoc dl(N);
6124   EVT VT = N->getValueType(0);
6125   if (VT.isVector() && ST->hasNEON()) {
6126 
6127     // Compute the least significant set bit: LSB = X & -X
6128     SDValue X = N->getOperand(0);
6129     SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6130     SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
6131 
6132     EVT ElemTy = VT.getVectorElementType();
6133 
6134     if (ElemTy == MVT::i8) {
6135       // Compute with: cttz(x) = ctpop(lsb - 1)
6136       SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6137                                 DAG.getTargetConstant(1, dl, ElemTy));
6138       SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6139       return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6140     }
6141 
6142     if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6143         (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
6144       // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
6145       unsigned NumBits = ElemTy.getSizeInBits();
6146       SDValue WidthMinus1 =
6147           DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6148                       DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6149       SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6150       return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6151     }
6152 
6153     // Compute with: cttz(x) = ctpop(lsb - 1)
6154 
6155     // Compute LSB - 1.
6156     SDValue Bits;
6157     if (ElemTy == MVT::i64) {
6158       // Load constant 0xffff'ffff'ffff'ffff to register.
6159       SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6160                                DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6161       Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6162     } else {
6163       SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6164                                 DAG.getTargetConstant(1, dl, ElemTy));
6165       Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6166     }
6167     return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6168   }
6169 
6170   if (!ST->hasV6T2Ops())
6171     return SDValue();
6172 
6173   SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6174   return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6175 }
6176 
6177 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
6178                           const ARMSubtarget *ST) {
6179   EVT VT = N->getValueType(0);
6180   SDLoc DL(N);
6181 
6182   assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6183   assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6184           VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6185          "Unexpected type for custom ctpop lowering");
6186 
6187   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6188   EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6189   SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6190   Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6191 
6192   // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
6193   unsigned EltSize = 8;
6194   unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6195   while (EltSize != VT.getScalarSizeInBits()) {
6196     SmallVector<SDValue, 8> Ops;
6197     Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6198                                   TLI.getPointerTy(DAG.getDataLayout())));
6199     Ops.push_back(Res);
6200 
6201     EltSize *= 2;
6202     NumElts /= 2;
6203     MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6204     Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6205   }
6206 
6207   return Res;
6208 }
6209 
6210 /// Getvshiftimm - Check if this is a valid build_vector for the immediate
6211 /// operand of a vector shift operation, where all the elements of the
6212 /// build_vector must have the same constant integer value.
6213 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6214   // Ignore bit_converts.
6215   while (Op.getOpcode() == ISD::BITCAST)
6216     Op = Op.getOperand(0);
6217   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
6218   APInt SplatBits, SplatUndef;
6219   unsigned SplatBitSize;
6220   bool HasAnyUndefs;
6221   if (!BVN ||
6222       !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6223                             ElementBits) ||
6224       SplatBitSize > ElementBits)
6225     return false;
6226   Cnt = SplatBits.getSExtValue();
6227   return true;
6228 }
6229 
6230 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
6231 /// operand of a vector shift left operation.  That value must be in the range:
6232 ///   0 <= Value < ElementBits for a left shift; or
6233 ///   0 <= Value <= ElementBits for a long left shift.
6234 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6235   assert(VT.isVector() && "vector shift count is not a vector type");
6236   int64_t ElementBits = VT.getScalarSizeInBits();
6237   if (!getVShiftImm(Op, ElementBits, Cnt))
6238     return false;
6239   return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6240 }
6241 
6242 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
6243 /// operand of a vector shift right operation.  For a shift opcode, the value
6244 /// is positive, but for an intrinsic the value count must be negative. The
6245 /// absolute value must be in the range:
6246 ///   1 <= |Value| <= ElementBits for a right shift; or
6247 ///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
6248 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6249                          int64_t &Cnt) {
6250   assert(VT.isVector() && "vector shift count is not a vector type");
6251   int64_t ElementBits = VT.getScalarSizeInBits();
6252   if (!getVShiftImm(Op, ElementBits, Cnt))
6253     return false;
6254   if (!isIntrinsic)
6255     return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6256   if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6257     Cnt = -Cnt;
6258     return true;
6259   }
6260   return false;
6261 }
6262 
6263 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
6264                           const ARMSubtarget *ST) {
6265   EVT VT = N->getValueType(0);
6266   SDLoc dl(N);
6267   int64_t Cnt;
6268 
6269   if (!VT.isVector())
6270     return SDValue();
6271 
6272   // We essentially have two forms here. Shift by an immediate and shift by a
6273   // vector register (there are also shift by a gpr, but that is just handled
6274   // with a tablegen pattern). We cannot easily match shift by an immediate in
6275   // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6276   // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6277   // signed or unsigned, and a negative shift indicates a shift right).
6278   if (N->getOpcode() == ISD::SHL) {
6279     if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6280       return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6281                          DAG.getConstant(Cnt, dl, MVT::i32));
6282     return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6283                        N->getOperand(1));
6284   }
6285 
6286   assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6287          "unexpected vector shift opcode");
6288 
6289   if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6290     unsigned VShiftOpc =
6291         (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6292     return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6293                        DAG.getConstant(Cnt, dl, MVT::i32));
6294   }
6295 
6296   // Other right shifts we don't have operations for (we use a shift left by a
6297   // negative number).
6298   EVT ShiftVT = N->getOperand(1).getValueType();
6299   SDValue NegatedCount = DAG.getNode(
6300       ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6301   unsigned VShiftOpc =
6302       (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6303   return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6304 }
6305 
6306 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
6307                                 const ARMSubtarget *ST) {
6308   EVT VT = N->getValueType(0);
6309   SDLoc dl(N);
6310 
6311   // We can get here for a node like i32 = ISD::SHL i32, i64
6312   if (VT != MVT::i64)
6313     return SDValue();
6314 
6315   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6316           N->getOpcode() == ISD::SHL) &&
6317          "Unknown shift to lower!");
6318 
6319   unsigned ShOpc = N->getOpcode();
6320   if (ST->hasMVEIntegerOps()) {
6321     SDValue ShAmt = N->getOperand(1);
6322     unsigned ShPartsOpc = ARMISD::LSLL;
6323     ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
6324 
6325     // If the shift amount is greater than 32 or has a greater bitwidth than 64
6326     // then do the default optimisation
6327     if (ShAmt->getValueType(0).getSizeInBits() > 64 ||
6328         (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32)))
6329       return SDValue();
6330 
6331     // Extract the lower 32 bits of the shift amount if it's not an i32
6332     if (ShAmt->getValueType(0) != MVT::i32)
6333       ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6334 
6335     if (ShOpc == ISD::SRL) {
6336       if (!Con)
6337         // There is no t2LSRLr instruction so negate and perform an lsll if the
6338         // shift amount is in a register, emulating a right shift.
6339         ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6340                             DAG.getConstant(0, dl, MVT::i32), ShAmt);
6341       else
6342         // Else generate an lsrl on the immediate shift amount
6343         ShPartsOpc = ARMISD::LSRL;
6344     } else if (ShOpc == ISD::SRA)
6345       ShPartsOpc = ARMISD::ASRL;
6346 
6347     // Lower 32 bits of the destination/source
6348     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
6349                              DAG.getConstant(0, dl, MVT::i32));
6350     // Upper 32 bits of the destination/source
6351     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
6352                              DAG.getConstant(1, dl, MVT::i32));
6353 
6354     // Generate the shift operation as computed above
6355     Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6356                      ShAmt);
6357     // The upper 32 bits come from the second return value of lsll
6358     Hi = SDValue(Lo.getNode(), 1);
6359     return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6360   }
6361 
6362   // We only lower SRA, SRL of 1 here, all others use generic lowering.
6363   if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6364     return SDValue();
6365 
6366   // If we are in thumb mode, we don't have RRX.
6367   if (ST->isThumb1Only())
6368     return SDValue();
6369 
6370   // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
6371   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
6372                            DAG.getConstant(0, dl, MVT::i32));
6373   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
6374                            DAG.getConstant(1, dl, MVT::i32));
6375 
6376   // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
6377   // captures the result into a carry flag.
6378   unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
6379   Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
6380 
6381   // The low part is an ARMISD::RRX operand, which shifts the carry in.
6382   Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6383 
6384   // Merge the pieces into a single i64 value.
6385  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6386 }
6387 
6388 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
6389                            const ARMSubtarget *ST) {
6390   bool Invert = false;
6391   bool Swap = false;
6392   unsigned Opc = ARMCC::AL;
6393 
6394   SDValue Op0 = Op.getOperand(0);
6395   SDValue Op1 = Op.getOperand(1);
6396   SDValue CC = Op.getOperand(2);
6397   EVT VT = Op.getValueType();
6398   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6399   SDLoc dl(Op);
6400 
6401   EVT CmpVT;
6402   if (ST->hasNEON())
6403     CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
6404   else {
6405     assert(ST->hasMVEIntegerOps() &&
6406            "No hardware support for integer vector comparison!");
6407 
6408     if (Op.getValueType().getVectorElementType() != MVT::i1)
6409       return SDValue();
6410 
6411     // Make sure we expand floating point setcc to scalar if we do not have
6412     // mve.fp, so that we can handle them from there.
6413     if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6414       return SDValue();
6415 
6416     CmpVT = VT;
6417   }
6418 
6419   if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6420       (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6421     // Special-case integer 64-bit equality comparisons. They aren't legal,
6422     // but they can be lowered with a few vector instructions.
6423     unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6424     EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6425     SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6426     SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6427     SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6428                               DAG.getCondCode(ISD::SETEQ));
6429     SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6430     SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6431     Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6432     if (SetCCOpcode == ISD::SETNE)
6433       Merged = DAG.getNOT(dl, Merged, CmpVT);
6434     Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6435     return Merged;
6436   }
6437 
6438   if (CmpVT.getVectorElementType() == MVT::i64)
6439     // 64-bit comparisons are not legal in general.
6440     return SDValue();
6441 
6442   if (Op1.getValueType().isFloatingPoint()) {
6443     switch (SetCCOpcode) {
6444     default: llvm_unreachable("Illegal FP comparison");
6445     case ISD::SETUNE:
6446     case ISD::SETNE:
6447       if (ST->hasMVEFloatOps()) {
6448         Opc = ARMCC::NE; break;
6449       } else {
6450         Invert = true; LLVM_FALLTHROUGH;
6451       }
6452     case ISD::SETOEQ:
6453     case ISD::SETEQ:  Opc = ARMCC::EQ; break;
6454     case ISD::SETOLT:
6455     case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
6456     case ISD::SETOGT:
6457     case ISD::SETGT:  Opc = ARMCC::GT; break;
6458     case ISD::SETOLE:
6459     case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
6460     case ISD::SETOGE:
6461     case ISD::SETGE: Opc = ARMCC::GE; break;
6462     case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
6463     case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6464     case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
6465     case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6466     case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
6467     case ISD::SETONE: {
6468       // Expand this to (OLT | OGT).
6469       SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6470                                    DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6471       SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6472                                    DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6473       SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6474       if (Invert)
6475         Result = DAG.getNOT(dl, Result, VT);
6476       return Result;
6477     }
6478     case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH;
6479     case ISD::SETO: {
6480       // Expand this to (OLT | OGE).
6481       SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6482                                    DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6483       SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6484                                    DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6485       SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6486       if (Invert)
6487         Result = DAG.getNOT(dl, Result, VT);
6488       return Result;
6489     }
6490     }
6491   } else {
6492     // Integer comparisons.
6493     switch (SetCCOpcode) {
6494     default: llvm_unreachable("Illegal integer comparison");
6495     case ISD::SETNE:
6496       if (ST->hasMVEIntegerOps()) {
6497         Opc = ARMCC::NE; break;
6498       } else {
6499         Invert = true; LLVM_FALLTHROUGH;
6500       }
6501     case ISD::SETEQ:  Opc = ARMCC::EQ; break;
6502     case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
6503     case ISD::SETGT:  Opc = ARMCC::GT; break;
6504     case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
6505     case ISD::SETGE:  Opc = ARMCC::GE; break;
6506     case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
6507     case ISD::SETUGT: Opc = ARMCC::HI; break;
6508     case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
6509     case ISD::SETUGE: Opc = ARMCC::HS; break;
6510     }
6511 
6512     // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6513     if (ST->hasNEON() && Opc == ARMCC::EQ) {
6514       SDValue AndOp;
6515       if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6516         AndOp = Op0;
6517       else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6518         AndOp = Op1;
6519 
6520       // Ignore bitconvert.
6521       if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6522         AndOp = AndOp.getOperand(0);
6523 
6524       if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6525         Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6526         Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6527         SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6528         if (!Invert)
6529           Result = DAG.getNOT(dl, Result, VT);
6530         return Result;
6531       }
6532     }
6533   }
6534 
6535   if (Swap)
6536     std::swap(Op0, Op1);
6537 
6538   // If one of the operands is a constant vector zero, attempt to fold the
6539   // comparison to a specialized compare-against-zero form.
6540   SDValue SingleOp;
6541   if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6542     SingleOp = Op0;
6543   else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
6544     if (Opc == ARMCC::GE)
6545       Opc = ARMCC::LE;
6546     else if (Opc == ARMCC::GT)
6547       Opc = ARMCC::LT;
6548     SingleOp = Op1;
6549   }
6550 
6551   SDValue Result;
6552   if (SingleOp.getNode()) {
6553     Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp,
6554                          DAG.getConstant(Opc, dl, MVT::i32));
6555   } else {
6556     Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6557                          DAG.getConstant(Opc, dl, MVT::i32));
6558   }
6559 
6560   Result = DAG.getSExtOrTrunc(Result, dl, VT);
6561 
6562   if (Invert)
6563     Result = DAG.getNOT(dl, Result, VT);
6564 
6565   return Result;
6566 }
6567 
6568 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
6569   SDValue LHS = Op.getOperand(0);
6570   SDValue RHS = Op.getOperand(1);
6571   SDValue Carry = Op.getOperand(2);
6572   SDValue Cond = Op.getOperand(3);
6573   SDLoc DL(Op);
6574 
6575   assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6576 
6577   // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
6578   // have to invert the carry first.
6579   Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6580                       DAG.getConstant(1, DL, MVT::i32), Carry);
6581   // This converts the boolean value carry into the carry flag.
6582   Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6583 
6584   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6585   SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6586 
6587   SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6588   SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6589   SDValue ARMcc = DAG.getConstant(
6590       IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6591   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6592   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
6593                                    Cmp.getValue(1), SDValue());
6594   return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6595                      CCR, Chain.getValue(1));
6596 }
6597 
6598 /// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6599 /// valid vector constant for a NEON or MVE instruction with a "modified
6600 /// immediate" operand (e.g., VMOV).  If so, return the encoded value.
6601 static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6602                                  unsigned SplatBitSize, SelectionDAG &DAG,
6603                                  const SDLoc &dl, EVT &VT, EVT VectorVT,
6604                                  VMOVModImmType type) {
6605   unsigned OpCmode, Imm;
6606   bool is128Bits = VectorVT.is128BitVector();
6607 
6608   // SplatBitSize is set to the smallest size that splats the vector, so a
6609   // zero vector will always have SplatBitSize == 8.  However, NEON modified
6610   // immediate instructions others than VMOV do not support the 8-bit encoding
6611   // of a zero vector, and the default encoding of zero is supposed to be the
6612   // 32-bit version.
6613   if (SplatBits == 0)
6614     SplatBitSize = 32;
6615 
6616   switch (SplatBitSize) {
6617   case 8:
6618     if (type != VMOVModImm)
6619       return SDValue();
6620     // Any 1-byte value is OK.  Op=0, Cmode=1110.
6621     assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6622     OpCmode = 0xe;
6623     Imm = SplatBits;
6624     VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
6625     break;
6626 
6627   case 16:
6628     // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6629     VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
6630     if ((SplatBits & ~0xff) == 0) {
6631       // Value = 0x00nn: Op=x, Cmode=100x.
6632       OpCmode = 0x8;
6633       Imm = SplatBits;
6634       break;
6635     }
6636     if ((SplatBits & ~0xff00) == 0) {
6637       // Value = 0xnn00: Op=x, Cmode=101x.
6638       OpCmode = 0xa;
6639       Imm = SplatBits >> 8;
6640       break;
6641     }
6642     return SDValue();
6643 
6644   case 32:
6645     // NEON's 32-bit VMOV supports splat values where:
6646     // * only one byte is nonzero, or
6647     // * the least significant byte is 0xff and the second byte is nonzero, or
6648     // * the least significant 2 bytes are 0xff and the third is nonzero.
6649     VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
6650     if ((SplatBits & ~0xff) == 0) {
6651       // Value = 0x000000nn: Op=x, Cmode=000x.
6652       OpCmode = 0;
6653       Imm = SplatBits;
6654       break;
6655     }
6656     if ((SplatBits & ~0xff00) == 0) {
6657       // Value = 0x0000nn00: Op=x, Cmode=001x.
6658       OpCmode = 0x2;
6659       Imm = SplatBits >> 8;
6660       break;
6661     }
6662     if ((SplatBits & ~0xff0000) == 0) {
6663       // Value = 0x00nn0000: Op=x, Cmode=010x.
6664       OpCmode = 0x4;
6665       Imm = SplatBits >> 16;
6666       break;
6667     }
6668     if ((SplatBits & ~0xff000000) == 0) {
6669       // Value = 0xnn000000: Op=x, Cmode=011x.
6670       OpCmode = 0x6;
6671       Imm = SplatBits >> 24;
6672       break;
6673     }
6674 
6675     // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
6676     if (type == OtherModImm) return SDValue();
6677 
6678     if ((SplatBits & ~0xffff) == 0 &&
6679         ((SplatBits | SplatUndef) & 0xff) == 0xff) {
6680       // Value = 0x0000nnff: Op=x, Cmode=1100.
6681       OpCmode = 0xc;
6682       Imm = SplatBits >> 8;
6683       break;
6684     }
6685 
6686     // cmode == 0b1101 is not supported for MVE VMVN
6687     if (type == MVEVMVNModImm)
6688       return SDValue();
6689 
6690     if ((SplatBits & ~0xffffff) == 0 &&
6691         ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
6692       // Value = 0x00nnffff: Op=x, Cmode=1101.
6693       OpCmode = 0xd;
6694       Imm = SplatBits >> 16;
6695       break;
6696     }
6697 
6698     // Note: there are a few 32-bit splat values (specifically: 00ffff00,
6699     // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
6700     // VMOV.I32.  A (very) minor optimization would be to replicate the value
6701     // and fall through here to test for a valid 64-bit splat.  But, then the
6702     // caller would also need to check and handle the change in size.
6703     return SDValue();
6704 
6705   case 64: {
6706     if (type != VMOVModImm)
6707       return SDValue();
6708     // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
6709     uint64_t BitMask = 0xff;
6710     uint64_t Val = 0;
6711     unsigned ImmMask = 1;
6712     Imm = 0;
6713     for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
6714       if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
6715         Val |= BitMask;
6716         Imm |= ImmMask;
6717       } else if ((SplatBits & BitMask) != 0) {
6718         return SDValue();
6719       }
6720       BitMask <<= 8;
6721       ImmMask <<= 1;
6722     }
6723 
6724     if (DAG.getDataLayout().isBigEndian()) {
6725       // Reverse the order of elements within the vector.
6726       unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8;
6727       unsigned Mask = (1 << BytesPerElem) - 1;
6728       unsigned NumElems = 8 / BytesPerElem;
6729       unsigned NewImm = 0;
6730       for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
6731         unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask);
6732         NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem;
6733       }
6734       Imm = NewImm;
6735     }
6736 
6737     // Op=1, Cmode=1110.
6738     OpCmode = 0x1e;
6739     VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
6740     break;
6741   }
6742 
6743   default:
6744     llvm_unreachable("unexpected size for isVMOVModifiedImm");
6745   }
6746 
6747   unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
6748   return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
6749 }
6750 
6751 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
6752                                            const ARMSubtarget *ST) const {
6753   EVT VT = Op.getValueType();
6754   bool IsDouble = (VT == MVT::f64);
6755   ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
6756   const APFloat &FPVal = CFP->getValueAPF();
6757 
6758   // Prevent floating-point constants from using literal loads
6759   // when execute-only is enabled.
6760   if (ST->genExecuteOnly()) {
6761     // If we can represent the constant as an immediate, don't lower it
6762     if (isFPImmLegal(FPVal, VT))
6763       return Op;
6764     // Otherwise, construct as integer, and move to float register
6765     APInt INTVal = FPVal.bitcastToAPInt();
6766     SDLoc DL(CFP);
6767     switch (VT.getSimpleVT().SimpleTy) {
6768       default:
6769         llvm_unreachable("Unknown floating point type!");
6770         break;
6771       case MVT::f64: {
6772         SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
6773         SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
6774         return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
6775       }
6776       case MVT::f32:
6777           return DAG.getNode(ARMISD::VMOVSR, DL, VT,
6778               DAG.getConstant(INTVal, DL, MVT::i32));
6779     }
6780   }
6781 
6782   if (!ST->hasVFP3Base())
6783     return SDValue();
6784 
6785   // Use the default (constant pool) lowering for double constants when we have
6786   // an SP-only FPU
6787   if (IsDouble && !Subtarget->hasFP64())
6788     return SDValue();
6789 
6790   // Try splatting with a VMOV.f32...
6791   int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
6792 
6793   if (ImmVal != -1) {
6794     if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
6795       // We have code in place to select a valid ConstantFP already, no need to
6796       // do any mangling.
6797       return Op;
6798     }
6799 
6800     // It's a float and we are trying to use NEON operations where
6801     // possible. Lower it to a splat followed by an extract.
6802     SDLoc DL(Op);
6803     SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
6804     SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
6805                                       NewVal);
6806     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
6807                        DAG.getConstant(0, DL, MVT::i32));
6808   }
6809 
6810   // The rest of our options are NEON only, make sure that's allowed before
6811   // proceeding..
6812   if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
6813     return SDValue();
6814 
6815   EVT VMovVT;
6816   uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
6817 
6818   // It wouldn't really be worth bothering for doubles except for one very
6819   // important value, which does happen to match: 0.0. So make sure we don't do
6820   // anything stupid.
6821   if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
6822     return SDValue();
6823 
6824   // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
6825   SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
6826                                      VMovVT, VT, VMOVModImm);
6827   if (NewVal != SDValue()) {
6828     SDLoc DL(Op);
6829     SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
6830                                       NewVal);
6831     if (IsDouble)
6832       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6833 
6834     // It's a float: cast and extract a vector element.
6835     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6836                                        VecConstant);
6837     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6838                        DAG.getConstant(0, DL, MVT::i32));
6839   }
6840 
6841   // Finally, try a VMVN.i32
6842   NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
6843                              VT, VMVNModImm);
6844   if (NewVal != SDValue()) {
6845     SDLoc DL(Op);
6846     SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
6847 
6848     if (IsDouble)
6849       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6850 
6851     // It's a float: cast and extract a vector element.
6852     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6853                                        VecConstant);
6854     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6855                        DAG.getConstant(0, DL, MVT::i32));
6856   }
6857 
6858   return SDValue();
6859 }
6860 
6861 // check if an VEXT instruction can handle the shuffle mask when the
6862 // vector sources of the shuffle are the same.
6863 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
6864   unsigned NumElts = VT.getVectorNumElements();
6865 
6866   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
6867   if (M[0] < 0)
6868     return false;
6869 
6870   Imm = M[0];
6871 
6872   // If this is a VEXT shuffle, the immediate value is the index of the first
6873   // element.  The other shuffle indices must be the successive elements after
6874   // the first one.
6875   unsigned ExpectedElt = Imm;
6876   for (unsigned i = 1; i < NumElts; ++i) {
6877     // Increment the expected index.  If it wraps around, just follow it
6878     // back to index zero and keep going.
6879     ++ExpectedElt;
6880     if (ExpectedElt == NumElts)
6881       ExpectedElt = 0;
6882 
6883     if (M[i] < 0) continue; // ignore UNDEF indices
6884     if (ExpectedElt != static_cast<unsigned>(M[i]))
6885       return false;
6886   }
6887 
6888   return true;
6889 }
6890 
6891 static bool isVEXTMask(ArrayRef<int> M, EVT VT,
6892                        bool &ReverseVEXT, unsigned &Imm) {
6893   unsigned NumElts = VT.getVectorNumElements();
6894   ReverseVEXT = false;
6895 
6896   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
6897   if (M[0] < 0)
6898     return false;
6899 
6900   Imm = M[0];
6901 
6902   // If this is a VEXT shuffle, the immediate value is the index of the first
6903   // element.  The other shuffle indices must be the successive elements after
6904   // the first one.
6905   unsigned ExpectedElt = Imm;
6906   for (unsigned i = 1; i < NumElts; ++i) {
6907     // Increment the expected index.  If it wraps around, it may still be
6908     // a VEXT but the source vectors must be swapped.
6909     ExpectedElt += 1;
6910     if (ExpectedElt == NumElts * 2) {
6911       ExpectedElt = 0;
6912       ReverseVEXT = true;
6913     }
6914 
6915     if (M[i] < 0) continue; // ignore UNDEF indices
6916     if (ExpectedElt != static_cast<unsigned>(M[i]))
6917       return false;
6918   }
6919 
6920   // Adjust the index value if the source operands will be swapped.
6921   if (ReverseVEXT)
6922     Imm -= NumElts;
6923 
6924   return true;
6925 }
6926 
6927 /// isVREVMask - Check if a vector shuffle corresponds to a VREV
6928 /// instruction with the specified blocksize.  (The order of the elements
6929 /// within each block of the vector is reversed.)
6930 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
6931   assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
6932          "Only possible block sizes for VREV are: 16, 32, 64");
6933 
6934   unsigned EltSz = VT.getScalarSizeInBits();
6935   if (EltSz == 64)
6936     return false;
6937 
6938   unsigned NumElts = VT.getVectorNumElements();
6939   unsigned BlockElts = M[0] + 1;
6940   // If the first shuffle index is UNDEF, be optimistic.
6941   if (M[0] < 0)
6942     BlockElts = BlockSize / EltSz;
6943 
6944   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
6945     return false;
6946 
6947   for (unsigned i = 0; i < NumElts; ++i) {
6948     if (M[i] < 0) continue; // ignore UNDEF indices
6949     if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
6950       return false;
6951   }
6952 
6953   return true;
6954 }
6955 
6956 static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
6957   // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
6958   // range, then 0 is placed into the resulting vector. So pretty much any mask
6959   // of 8 elements can work here.
6960   return VT == MVT::v8i8 && M.size() == 8;
6961 }
6962 
6963 static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
6964                                unsigned Index) {
6965   if (Mask.size() == Elements * 2)
6966     return Index / Elements;
6967   return Mask[Index] == 0 ? 0 : 1;
6968 }
6969 
6970 // Checks whether the shuffle mask represents a vector transpose (VTRN) by
6971 // checking that pairs of elements in the shuffle mask represent the same index
6972 // in each vector, incrementing the expected index by 2 at each step.
6973 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
6974 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
6975 //  v2={e,f,g,h}
6976 // WhichResult gives the offset for each element in the mask based on which
6977 // of the two results it belongs to.
6978 //
6979 // The transpose can be represented either as:
6980 // result1 = shufflevector v1, v2, result1_shuffle_mask
6981 // result2 = shufflevector v1, v2, result2_shuffle_mask
6982 // where v1/v2 and the shuffle masks have the same number of elements
6983 // (here WhichResult (see below) indicates which result is being checked)
6984 //
6985 // or as:
6986 // results = shufflevector v1, v2, shuffle_mask
6987 // where both results are returned in one vector and the shuffle mask has twice
6988 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
6989 // want to check the low half and high half of the shuffle mask as if it were
6990 // the other case
6991 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
6992   unsigned EltSz = VT.getScalarSizeInBits();
6993   if (EltSz == 64)
6994     return false;
6995 
6996   unsigned NumElts = VT.getVectorNumElements();
6997   if (M.size() != NumElts && M.size() != NumElts*2)
6998     return false;
6999 
7000   // If the mask is twice as long as the input vector then we need to check the
7001   // upper and lower parts of the mask with a matching value for WhichResult
7002   // FIXME: A mask with only even values will be rejected in case the first
7003   // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7004   // M[0] is used to determine WhichResult
7005   for (unsigned i = 0; i < M.size(); i += NumElts) {
7006     WhichResult = SelectPairHalf(NumElts, M, i);
7007     for (unsigned j = 0; j < NumElts; j += 2) {
7008       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7009           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7010         return false;
7011     }
7012   }
7013 
7014   if (M.size() == NumElts*2)
7015     WhichResult = 0;
7016 
7017   return true;
7018 }
7019 
7020 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7021 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7022 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7023 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7024   unsigned EltSz = VT.getScalarSizeInBits();
7025   if (EltSz == 64)
7026     return false;
7027 
7028   unsigned NumElts = VT.getVectorNumElements();
7029   if (M.size() != NumElts && M.size() != NumElts*2)
7030     return false;
7031 
7032   for (unsigned i = 0; i < M.size(); i += NumElts) {
7033     WhichResult = SelectPairHalf(NumElts, M, i);
7034     for (unsigned j = 0; j < NumElts; j += 2) {
7035       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7036           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7037         return false;
7038     }
7039   }
7040 
7041   if (M.size() == NumElts*2)
7042     WhichResult = 0;
7043 
7044   return true;
7045 }
7046 
7047 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7048 // that the mask elements are either all even and in steps of size 2 or all odd
7049 // and in steps of size 2.
7050 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7051 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7052 //  v2={e,f,g,h}
7053 // Requires similar checks to that of isVTRNMask with
7054 // respect the how results are returned.
7055 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7056   unsigned EltSz = VT.getScalarSizeInBits();
7057   if (EltSz == 64)
7058     return false;
7059 
7060   unsigned NumElts = VT.getVectorNumElements();
7061   if (M.size() != NumElts && M.size() != NumElts*2)
7062     return false;
7063 
7064   for (unsigned i = 0; i < M.size(); i += NumElts) {
7065     WhichResult = SelectPairHalf(NumElts, M, i);
7066     for (unsigned j = 0; j < NumElts; ++j) {
7067       if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7068         return false;
7069     }
7070   }
7071 
7072   if (M.size() == NumElts*2)
7073     WhichResult = 0;
7074 
7075   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7076   if (VT.is64BitVector() && EltSz == 32)
7077     return false;
7078 
7079   return true;
7080 }
7081 
7082 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7083 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7084 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
7085 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7086   unsigned EltSz = VT.getScalarSizeInBits();
7087   if (EltSz == 64)
7088     return false;
7089 
7090   unsigned NumElts = VT.getVectorNumElements();
7091   if (M.size() != NumElts && M.size() != NumElts*2)
7092     return false;
7093 
7094   unsigned Half = NumElts / 2;
7095   for (unsigned i = 0; i < M.size(); i += NumElts) {
7096     WhichResult = SelectPairHalf(NumElts, M, i);
7097     for (unsigned j = 0; j < NumElts; j += Half) {
7098       unsigned Idx = WhichResult;
7099       for (unsigned k = 0; k < Half; ++k) {
7100         int MIdx = M[i + j + k];
7101         if (MIdx >= 0 && (unsigned) MIdx != Idx)
7102           return false;
7103         Idx += 2;
7104       }
7105     }
7106   }
7107 
7108   if (M.size() == NumElts*2)
7109     WhichResult = 0;
7110 
7111   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7112   if (VT.is64BitVector() && EltSz == 32)
7113     return false;
7114 
7115   return true;
7116 }
7117 
7118 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7119 // that pairs of elements of the shufflemask represent the same index in each
7120 // vector incrementing sequentially through the vectors.
7121 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7122 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7123 //  v2={e,f,g,h}
7124 // Requires similar checks to that of isVTRNMask with respect the how results
7125 // are returned.
7126 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7127   unsigned EltSz = VT.getScalarSizeInBits();
7128   if (EltSz == 64)
7129     return false;
7130 
7131   unsigned NumElts = VT.getVectorNumElements();
7132   if (M.size() != NumElts && M.size() != NumElts*2)
7133     return false;
7134 
7135   for (unsigned i = 0; i < M.size(); i += NumElts) {
7136     WhichResult = SelectPairHalf(NumElts, M, i);
7137     unsigned Idx = WhichResult * NumElts / 2;
7138     for (unsigned j = 0; j < NumElts; j += 2) {
7139       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7140           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7141         return false;
7142       Idx += 1;
7143     }
7144   }
7145 
7146   if (M.size() == NumElts*2)
7147     WhichResult = 0;
7148 
7149   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7150   if (VT.is64BitVector() && EltSz == 32)
7151     return false;
7152 
7153   return true;
7154 }
7155 
7156 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7157 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7158 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7159 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7160   unsigned EltSz = VT.getScalarSizeInBits();
7161   if (EltSz == 64)
7162     return false;
7163 
7164   unsigned NumElts = VT.getVectorNumElements();
7165   if (M.size() != NumElts && M.size() != NumElts*2)
7166     return false;
7167 
7168   for (unsigned i = 0; i < M.size(); i += NumElts) {
7169     WhichResult = SelectPairHalf(NumElts, M, i);
7170     unsigned Idx = WhichResult * NumElts / 2;
7171     for (unsigned j = 0; j < NumElts; j += 2) {
7172       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7173           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7174         return false;
7175       Idx += 1;
7176     }
7177   }
7178 
7179   if (M.size() == NumElts*2)
7180     WhichResult = 0;
7181 
7182   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7183   if (VT.is64BitVector() && EltSz == 32)
7184     return false;
7185 
7186   return true;
7187 }
7188 
7189 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7190 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7191 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7192                                            unsigned &WhichResult,
7193                                            bool &isV_UNDEF) {
7194   isV_UNDEF = false;
7195   if (isVTRNMask(ShuffleMask, VT, WhichResult))
7196     return ARMISD::VTRN;
7197   if (isVUZPMask(ShuffleMask, VT, WhichResult))
7198     return ARMISD::VUZP;
7199   if (isVZIPMask(ShuffleMask, VT, WhichResult))
7200     return ARMISD::VZIP;
7201 
7202   isV_UNDEF = true;
7203   if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7204     return ARMISD::VTRN;
7205   if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7206     return ARMISD::VUZP;
7207   if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7208     return ARMISD::VZIP;
7209 
7210   return 0;
7211 }
7212 
7213 /// \return true if this is a reverse operation on an vector.
7214 static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7215   unsigned NumElts = VT.getVectorNumElements();
7216   // Make sure the mask has the right size.
7217   if (NumElts != M.size())
7218       return false;
7219 
7220   // Look for <15, ..., 3, -1, 1, 0>.
7221   for (unsigned i = 0; i != NumElts; ++i)
7222     if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7223       return false;
7224 
7225   return true;
7226 }
7227 
7228 static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) {
7229   unsigned NumElts = VT.getVectorNumElements();
7230   // Make sure the mask has the right size.
7231   if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7232       return false;
7233 
7234   // If Top
7235   //   Look for <0, N, 2, N+2, 4, N+4, ..>.
7236   //   This inserts Input2 into Input1
7237   // else if not Top
7238   //   Look for <0, N+1, 2, N+3, 4, N+5, ..>
7239   //   This inserts Input1 into Input2
7240   unsigned Offset = Top ? 0 : 1;
7241   for (unsigned i = 0; i < NumElts; i+=2) {
7242     if (M[i] >= 0 && M[i] != (int)i)
7243       return false;
7244     if (M[i+1] >= 0 && M[i+1] != (int)(NumElts + i + Offset))
7245       return false;
7246   }
7247 
7248   return true;
7249 }
7250 
7251 // Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7252 // from a pair of inputs. For example:
7253 // BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7254 //             FP_ROUND(EXTRACT_ELT(Y, 0),
7255 //             FP_ROUND(EXTRACT_ELT(X, 1),
7256 //             FP_ROUND(EXTRACT_ELT(Y, 1), ...)
7257 static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
7258                                          const ARMSubtarget *ST) {
7259   assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7260   if (!ST->hasMVEFloatOps())
7261     return SDValue();
7262 
7263   SDLoc dl(BV);
7264   EVT VT = BV.getValueType();
7265   if (VT != MVT::v8f16)
7266     return SDValue();
7267 
7268   // We are looking for a buildvector of fptrunc elements, where all the
7269   // elements are interleavingly extracted from two sources. Check the first two
7270   // items are valid enough and extract some info from them (they are checked
7271   // properly in the loop below).
7272   if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7273       BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7274       BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0)
7275     return SDValue();
7276   if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7277       BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7278       BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0)
7279     return SDValue();
7280   SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7281   SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7282   if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7283     return SDValue();
7284 
7285   // Check all the values in the BuildVector line up with our expectations.
7286   for (unsigned i = 1; i < 4; i++) {
7287     auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7288       return Trunc.getOpcode() == ISD::FP_ROUND &&
7289              Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7290              Trunc.getOperand(0).getOperand(0) == Op &&
7291              Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7292     };
7293     if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7294       return SDValue();
7295     if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7296       return SDValue();
7297   }
7298 
7299   SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7300                            DAG.getConstant(0, dl, MVT::i32));
7301   return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7302                      DAG.getConstant(1, dl, MVT::i32));
7303 }
7304 
7305 // Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7306 // from a single input on alternating lanes. For example:
7307 // BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7308 //             FP_ROUND(EXTRACT_ELT(X, 2),
7309 //             FP_ROUND(EXTRACT_ELT(X, 4), ...)
7310 static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
7311                                        const ARMSubtarget *ST) {
7312   assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7313   if (!ST->hasMVEFloatOps())
7314     return SDValue();
7315 
7316   SDLoc dl(BV);
7317   EVT VT = BV.getValueType();
7318   if (VT != MVT::v4f32)
7319     return SDValue();
7320 
7321   // We are looking for a buildvector of fptext elements, where all the
7322   // elements are alternating lanes from a single source. For example <0,2,4,6>
7323   // or <1,3,5,7>. Check the first two items are valid enough and extract some
7324   // info from them (they are checked properly in the loop below).
7325   if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7326       BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7327     return SDValue();
7328   SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7329   int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
7330   if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7331     return SDValue();
7332 
7333   // Check all the values in the BuildVector line up with our expectations.
7334   for (unsigned i = 1; i < 4; i++) {
7335     auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7336       return Trunc.getOpcode() == ISD::FP_EXTEND &&
7337              Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7338              Trunc.getOperand(0).getOperand(0) == Op &&
7339              Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7340     };
7341     if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7342       return SDValue();
7343   }
7344 
7345   return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7346                      DAG.getConstant(Offset, dl, MVT::i32));
7347 }
7348 
7349 // If N is an integer constant that can be moved into a register in one
7350 // instruction, return an SDValue of such a constant (will become a MOV
7351 // instruction).  Otherwise return null.
7352 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
7353                                      const ARMSubtarget *ST, const SDLoc &dl) {
7354   uint64_t Val;
7355   if (!isa<ConstantSDNode>(N))
7356     return SDValue();
7357   Val = cast<ConstantSDNode>(N)->getZExtValue();
7358 
7359   if (ST->isThumb1Only()) {
7360     if (Val <= 255 || ~Val <= 255)
7361       return DAG.getConstant(Val, dl, MVT::i32);
7362   } else {
7363     if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7364       return DAG.getConstant(Val, dl, MVT::i32);
7365   }
7366   return SDValue();
7367 }
7368 
7369 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
7370                                     const ARMSubtarget *ST) {
7371   SDLoc dl(Op);
7372   EVT VT = Op.getValueType();
7373 
7374   assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7375 
7376   unsigned NumElts = VT.getVectorNumElements();
7377   unsigned BoolMask;
7378   unsigned BitsPerBool;
7379   if (NumElts == 4) {
7380     BitsPerBool = 4;
7381     BoolMask = 0xf;
7382   } else if (NumElts == 8) {
7383     BitsPerBool = 2;
7384     BoolMask = 0x3;
7385   } else if (NumElts == 16) {
7386     BitsPerBool = 1;
7387     BoolMask = 0x1;
7388   } else
7389     return SDValue();
7390 
7391   // If this is a single value copied into all lanes (a splat), we can just sign
7392   // extend that single value
7393   SDValue FirstOp = Op.getOperand(0);
7394   if (!isa<ConstantSDNode>(FirstOp) &&
7395       std::all_of(std::next(Op->op_begin()), Op->op_end(),
7396                   [&FirstOp](SDUse &U) {
7397                     return U.get().isUndef() || U.get() == FirstOp;
7398                   })) {
7399     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7400                               DAG.getValueType(MVT::i1));
7401     return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7402   }
7403 
7404   // First create base with bits set where known
7405   unsigned Bits32 = 0;
7406   for (unsigned i = 0; i < NumElts; ++i) {
7407     SDValue V = Op.getOperand(i);
7408     if (!isa<ConstantSDNode>(V) && !V.isUndef())
7409       continue;
7410     bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue();
7411     if (BitSet)
7412       Bits32 |= BoolMask << (i * BitsPerBool);
7413   }
7414 
7415   // Add in unknown nodes
7416   SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7417                              DAG.getConstant(Bits32, dl, MVT::i32));
7418   for (unsigned i = 0; i < NumElts; ++i) {
7419     SDValue V = Op.getOperand(i);
7420     if (isa<ConstantSDNode>(V) || V.isUndef())
7421       continue;
7422     Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7423                        DAG.getConstant(i, dl, MVT::i32));
7424   }
7425 
7426   return Base;
7427 }
7428 
7429 // If this is a case we can't handle, return null and let the default
7430 // expansion code take care of it.
7431 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7432                                              const ARMSubtarget *ST) const {
7433   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7434   SDLoc dl(Op);
7435   EVT VT = Op.getValueType();
7436 
7437   if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7438     return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7439 
7440   APInt SplatBits, SplatUndef;
7441   unsigned SplatBitSize;
7442   bool HasAnyUndefs;
7443   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7444     if (SplatUndef.isAllOnesValue())
7445       return DAG.getUNDEF(VT);
7446 
7447     if ((ST->hasNEON() && SplatBitSize <= 64) ||
7448         (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7449       // Check if an immediate VMOV works.
7450       EVT VmovVT;
7451       SDValue Val =
7452           isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7453                             SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7454 
7455       if (Val.getNode()) {
7456         SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7457         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
7458       }
7459 
7460       // Try an immediate VMVN.
7461       uint64_t NegatedImm = (~SplatBits).getZExtValue();
7462       Val = isVMOVModifiedImm(
7463           NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7464           VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7465       if (Val.getNode()) {
7466         SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7467         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
7468       }
7469 
7470       // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7471       if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7472         int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7473         if (ImmVal != -1) {
7474           SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7475           return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7476         }
7477       }
7478     }
7479   }
7480 
7481   // Scan through the operands to see if only one value is used.
7482   //
7483   // As an optimisation, even if more than one value is used it may be more
7484   // profitable to splat with one value then change some lanes.
7485   //
7486   // Heuristically we decide to do this if the vector has a "dominant" value,
7487   // defined as splatted to more than half of the lanes.
7488   unsigned NumElts = VT.getVectorNumElements();
7489   bool isOnlyLowElement = true;
7490   bool usesOnlyOneValue = true;
7491   bool hasDominantValue = false;
7492   bool isConstant = true;
7493 
7494   // Map of the number of times a particular SDValue appears in the
7495   // element list.
7496   DenseMap<SDValue, unsigned> ValueCounts;
7497   SDValue Value;
7498   for (unsigned i = 0; i < NumElts; ++i) {
7499     SDValue V = Op.getOperand(i);
7500     if (V.isUndef())
7501       continue;
7502     if (i > 0)
7503       isOnlyLowElement = false;
7504     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
7505       isConstant = false;
7506 
7507     ValueCounts.insert(std::make_pair(V, 0));
7508     unsigned &Count = ValueCounts[V];
7509 
7510     // Is this value dominant? (takes up more than half of the lanes)
7511     if (++Count > (NumElts / 2)) {
7512       hasDominantValue = true;
7513       Value = V;
7514     }
7515   }
7516   if (ValueCounts.size() != 1)
7517     usesOnlyOneValue = false;
7518   if (!Value.getNode() && !ValueCounts.empty())
7519     Value = ValueCounts.begin()->first;
7520 
7521   if (ValueCounts.empty())
7522     return DAG.getUNDEF(VT);
7523 
7524   // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
7525   // Keep going if we are hitting this case.
7526   if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
7527     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7528 
7529   unsigned EltSize = VT.getScalarSizeInBits();
7530 
7531   // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
7532   // i32 and try again.
7533   if (hasDominantValue && EltSize <= 32) {
7534     if (!isConstant) {
7535       SDValue N;
7536 
7537       // If we are VDUPing a value that comes directly from a vector, that will
7538       // cause an unnecessary move to and from a GPR, where instead we could
7539       // just use VDUPLANE. We can only do this if the lane being extracted
7540       // is at a constant index, as the VDUP from lane instructions only have
7541       // constant-index forms.
7542       ConstantSDNode *constIndex;
7543       if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7544           (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7545         // We need to create a new undef vector to use for the VDUPLANE if the
7546         // size of the vector from which we get the value is different than the
7547         // size of the vector that we need to create. We will insert the element
7548         // such that the register coalescer will remove unnecessary copies.
7549         if (VT != Value->getOperand(0).getValueType()) {
7550           unsigned index = constIndex->getAPIntValue().getLimitedValue() %
7551                              VT.getVectorNumElements();
7552           N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7553                  DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7554                         Value, DAG.getConstant(index, dl, MVT::i32)),
7555                            DAG.getConstant(index, dl, MVT::i32));
7556         } else
7557           N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7558                         Value->getOperand(0), Value->getOperand(1));
7559       } else
7560         N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
7561 
7562       if (!usesOnlyOneValue) {
7563         // The dominant value was splatted as 'N', but we now have to insert
7564         // all differing elements.
7565         for (unsigned I = 0; I < NumElts; ++I) {
7566           if (Op.getOperand(I) == Value)
7567             continue;
7568           SmallVector<SDValue, 3> Ops;
7569           Ops.push_back(N);
7570           Ops.push_back(Op.getOperand(I));
7571           Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
7572           N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
7573         }
7574       }
7575       return N;
7576     }
7577     if (VT.getVectorElementType().isFloatingPoint()) {
7578       SmallVector<SDValue, 8> Ops;
7579       MVT FVT = VT.getVectorElementType().getSimpleVT();
7580       assert(FVT == MVT::f32 || FVT == MVT::f16);
7581       MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
7582       for (unsigned i = 0; i < NumElts; ++i)
7583         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
7584                                   Op.getOperand(i)));
7585       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
7586       SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
7587       Val = LowerBUILD_VECTOR(Val, DAG, ST);
7588       if (Val.getNode())
7589         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7590     }
7591     if (usesOnlyOneValue) {
7592       SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
7593       if (isConstant && Val.getNode())
7594         return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
7595     }
7596   }
7597 
7598   // If all elements are constants and the case above didn't get hit, fall back
7599   // to the default expansion, which will generate a load from the constant
7600   // pool.
7601   if (isConstant)
7602     return SDValue();
7603 
7604   // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
7605   // vmovn). Empirical tests suggest this is rarely worth it for vectors of
7606   // length <= 2.
7607   if (NumElts >= 4)
7608     if (SDValue shuffle = ReconstructShuffle(Op, DAG))
7609       return shuffle;
7610 
7611   // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
7612   // VCVT's
7613   if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
7614     return VCVT;
7615   if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
7616     return VCVT;
7617 
7618   if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
7619     // If we haven't found an efficient lowering, try splitting a 128-bit vector
7620     // into two 64-bit vectors; we might discover a better way to lower it.
7621     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
7622     EVT ExtVT = VT.getVectorElementType();
7623     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
7624     SDValue Lower =
7625         DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
7626     if (Lower.getOpcode() == ISD::BUILD_VECTOR)
7627       Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
7628     SDValue Upper = DAG.getBuildVector(
7629         HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
7630     if (Upper.getOpcode() == ISD::BUILD_VECTOR)
7631       Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
7632     if (Lower && Upper)
7633       return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
7634   }
7635 
7636   // Vectors with 32- or 64-bit elements can be built by directly assigning
7637   // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
7638   // will be legalized.
7639   if (EltSize >= 32) {
7640     // Do the expansion with floating-point types, since that is what the VFP
7641     // registers are defined to use, and since i64 is not legal.
7642     EVT EltVT = EVT::getFloatingPointVT(EltSize);
7643     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
7644     SmallVector<SDValue, 8> Ops;
7645     for (unsigned i = 0; i < NumElts; ++i)
7646       Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
7647     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
7648     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7649   }
7650 
7651   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
7652   // know the default expansion would otherwise fall back on something even
7653   // worse. For a vector with one or two non-undef values, that's
7654   // scalar_to_vector for the elements followed by a shuffle (provided the
7655   // shuffle is valid for the target) and materialization element by element
7656   // on the stack followed by a load for everything else.
7657   if (!isConstant && !usesOnlyOneValue) {
7658     SDValue Vec = DAG.getUNDEF(VT);
7659     for (unsigned i = 0 ; i < NumElts; ++i) {
7660       SDValue V = Op.getOperand(i);
7661       if (V.isUndef())
7662         continue;
7663       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
7664       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
7665     }
7666     return Vec;
7667   }
7668 
7669   return SDValue();
7670 }
7671 
7672 // Gather data to see if the operation can be modelled as a
7673 // shuffle in combination with VEXTs.
7674 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
7675                                               SelectionDAG &DAG) const {
7676   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7677   SDLoc dl(Op);
7678   EVT VT = Op.getValueType();
7679   unsigned NumElts = VT.getVectorNumElements();
7680 
7681   struct ShuffleSourceInfo {
7682     SDValue Vec;
7683     unsigned MinElt = std::numeric_limits<unsigned>::max();
7684     unsigned MaxElt = 0;
7685 
7686     // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
7687     // be compatible with the shuffle we intend to construct. As a result
7688     // ShuffleVec will be some sliding window into the original Vec.
7689     SDValue ShuffleVec;
7690 
7691     // Code should guarantee that element i in Vec starts at element "WindowBase
7692     // + i * WindowScale in ShuffleVec".
7693     int WindowBase = 0;
7694     int WindowScale = 1;
7695 
7696     ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
7697 
7698     bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
7699   };
7700 
7701   // First gather all vectors used as an immediate source for this BUILD_VECTOR
7702   // node.
7703   SmallVector<ShuffleSourceInfo, 2> Sources;
7704   for (unsigned i = 0; i < NumElts; ++i) {
7705     SDValue V = Op.getOperand(i);
7706     if (V.isUndef())
7707       continue;
7708     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
7709       // A shuffle can only come from building a vector from various
7710       // elements of other vectors.
7711       return SDValue();
7712     } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
7713       // Furthermore, shuffles require a constant mask, whereas extractelts
7714       // accept variable indices.
7715       return SDValue();
7716     }
7717 
7718     // Add this element source to the list if it's not already there.
7719     SDValue SourceVec = V.getOperand(0);
7720     auto Source = llvm::find(Sources, SourceVec);
7721     if (Source == Sources.end())
7722       Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
7723 
7724     // Update the minimum and maximum lane number seen.
7725     unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
7726     Source->MinElt = std::min(Source->MinElt, EltNo);
7727     Source->MaxElt = std::max(Source->MaxElt, EltNo);
7728   }
7729 
7730   // Currently only do something sane when at most two source vectors
7731   // are involved.
7732   if (Sources.size() > 2)
7733     return SDValue();
7734 
7735   // Find out the smallest element size among result and two sources, and use
7736   // it as element size to build the shuffle_vector.
7737   EVT SmallestEltTy = VT.getVectorElementType();
7738   for (auto &Source : Sources) {
7739     EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
7740     if (SrcEltTy.bitsLT(SmallestEltTy))
7741       SmallestEltTy = SrcEltTy;
7742   }
7743   unsigned ResMultiplier =
7744       VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
7745   NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
7746   EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
7747 
7748   // If the source vector is too wide or too narrow, we may nevertheless be able
7749   // to construct a compatible shuffle either by concatenating it with UNDEF or
7750   // extracting a suitable range of elements.
7751   for (auto &Src : Sources) {
7752     EVT SrcVT = Src.ShuffleVec.getValueType();
7753 
7754     uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
7755     uint64_t VTSize = VT.getFixedSizeInBits();
7756     if (SrcVTSize == VTSize)
7757       continue;
7758 
7759     // This stage of the search produces a source with the same element type as
7760     // the original, but with a total width matching the BUILD_VECTOR output.
7761     EVT EltVT = SrcVT.getVectorElementType();
7762     unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
7763     EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
7764 
7765     if (SrcVTSize < VTSize) {
7766       if (2 * SrcVTSize != VTSize)
7767         return SDValue();
7768       // We can pad out the smaller vector for free, so if it's part of a
7769       // shuffle...
7770       Src.ShuffleVec =
7771           DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
7772                       DAG.getUNDEF(Src.ShuffleVec.getValueType()));
7773       continue;
7774     }
7775 
7776     if (SrcVTSize != 2 * VTSize)
7777       return SDValue();
7778 
7779     if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
7780       // Span too large for a VEXT to cope
7781       return SDValue();
7782     }
7783 
7784     if (Src.MinElt >= NumSrcElts) {
7785       // The extraction can just take the second half
7786       Src.ShuffleVec =
7787           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7788                       DAG.getConstant(NumSrcElts, dl, MVT::i32));
7789       Src.WindowBase = -NumSrcElts;
7790     } else if (Src.MaxElt < NumSrcElts) {
7791       // The extraction can just take the first half
7792       Src.ShuffleVec =
7793           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7794                       DAG.getConstant(0, dl, MVT::i32));
7795     } else {
7796       // An actual VEXT is needed
7797       SDValue VEXTSrc1 =
7798           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7799                       DAG.getConstant(0, dl, MVT::i32));
7800       SDValue VEXTSrc2 =
7801           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7802                       DAG.getConstant(NumSrcElts, dl, MVT::i32));
7803 
7804       Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
7805                                    VEXTSrc2,
7806                                    DAG.getConstant(Src.MinElt, dl, MVT::i32));
7807       Src.WindowBase = -Src.MinElt;
7808     }
7809   }
7810 
7811   // Another possible incompatibility occurs from the vector element types. We
7812   // can fix this by bitcasting the source vectors to the same type we intend
7813   // for the shuffle.
7814   for (auto &Src : Sources) {
7815     EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
7816     if (SrcEltTy == SmallestEltTy)
7817       continue;
7818     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
7819     Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
7820     Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
7821     Src.WindowBase *= Src.WindowScale;
7822   }
7823 
7824   // Final sanity check before we try to actually produce a shuffle.
7825   LLVM_DEBUG(for (auto Src
7826                   : Sources)
7827                  assert(Src.ShuffleVec.getValueType() == ShuffleVT););
7828 
7829   // The stars all align, our next step is to produce the mask for the shuffle.
7830   SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
7831   int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
7832   for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
7833     SDValue Entry = Op.getOperand(i);
7834     if (Entry.isUndef())
7835       continue;
7836 
7837     auto Src = llvm::find(Sources, Entry.getOperand(0));
7838     int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
7839 
7840     // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
7841     // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
7842     // segment.
7843     EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
7844     int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
7845                                VT.getScalarSizeInBits());
7846     int LanesDefined = BitsDefined / BitsPerShuffleLane;
7847 
7848     // This source is expected to fill ResMultiplier lanes of the final shuffle,
7849     // starting at the appropriate offset.
7850     int *LaneMask = &Mask[i * ResMultiplier];
7851 
7852     int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
7853     ExtractBase += NumElts * (Src - Sources.begin());
7854     for (int j = 0; j < LanesDefined; ++j)
7855       LaneMask[j] = ExtractBase + j;
7856   }
7857 
7858 
7859   // We can't handle more than two sources. This should have already
7860   // been checked before this point.
7861   assert(Sources.size() <= 2 && "Too many sources!");
7862 
7863   SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
7864   for (unsigned i = 0; i < Sources.size(); ++i)
7865     ShuffleOps[i] = Sources[i].ShuffleVec;
7866 
7867   SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
7868                                             ShuffleOps[1], Mask, DAG);
7869   if (!Shuffle)
7870     return SDValue();
7871   return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
7872 }
7873 
7874 enum ShuffleOpCodes {
7875   OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
7876   OP_VREV,
7877   OP_VDUP0,
7878   OP_VDUP1,
7879   OP_VDUP2,
7880   OP_VDUP3,
7881   OP_VEXT1,
7882   OP_VEXT2,
7883   OP_VEXT3,
7884   OP_VUZPL, // VUZP, left result
7885   OP_VUZPR, // VUZP, right result
7886   OP_VZIPL, // VZIP, left result
7887   OP_VZIPR, // VZIP, right result
7888   OP_VTRNL, // VTRN, left result
7889   OP_VTRNR  // VTRN, right result
7890 };
7891 
7892 static bool isLegalMVEShuffleOp(unsigned PFEntry) {
7893   unsigned OpNum = (PFEntry >> 26) & 0x0F;
7894   switch (OpNum) {
7895   case OP_COPY:
7896   case OP_VREV:
7897   case OP_VDUP0:
7898   case OP_VDUP1:
7899   case OP_VDUP2:
7900   case OP_VDUP3:
7901     return true;
7902   }
7903   return false;
7904 }
7905 
7906 /// isShuffleMaskLegal - Targets can use this to indicate that they only
7907 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
7908 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
7909 /// are assumed to be legal.
7910 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
7911   if (VT.getVectorNumElements() == 4 &&
7912       (VT.is128BitVector() || VT.is64BitVector())) {
7913     unsigned PFIndexes[4];
7914     for (unsigned i = 0; i != 4; ++i) {
7915       if (M[i] < 0)
7916         PFIndexes[i] = 8;
7917       else
7918         PFIndexes[i] = M[i];
7919     }
7920 
7921     // Compute the index in the perfect shuffle table.
7922     unsigned PFTableIndex =
7923       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
7924     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
7925     unsigned Cost = (PFEntry >> 30);
7926 
7927     if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
7928       return true;
7929   }
7930 
7931   bool ReverseVEXT, isV_UNDEF;
7932   unsigned Imm, WhichResult;
7933 
7934   unsigned EltSize = VT.getScalarSizeInBits();
7935   if (EltSize >= 32 ||
7936       ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
7937       ShuffleVectorInst::isIdentityMask(M) ||
7938       isVREVMask(M, VT, 64) ||
7939       isVREVMask(M, VT, 32) ||
7940       isVREVMask(M, VT, 16))
7941     return true;
7942   else if (Subtarget->hasNEON() &&
7943            (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
7944             isVTBLMask(M, VT) ||
7945             isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
7946     return true;
7947   else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
7948            isReverseMask(M, VT))
7949     return true;
7950   else if (Subtarget->hasMVEIntegerOps() &&
7951            (isVMOVNMask(M, VT, 0) || isVMOVNMask(M, VT, 1)))
7952     return true;
7953   else
7954     return false;
7955 }
7956 
7957 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
7958 /// the specified operations to build the shuffle.
7959 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
7960                                       SDValue RHS, SelectionDAG &DAG,
7961                                       const SDLoc &dl) {
7962   unsigned OpNum = (PFEntry >> 26) & 0x0F;
7963   unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
7964   unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
7965 
7966   if (OpNum == OP_COPY) {
7967     if (LHSID == (1*9+2)*9+3) return LHS;
7968     assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
7969     return RHS;
7970   }
7971 
7972   SDValue OpLHS, OpRHS;
7973   OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
7974   OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
7975   EVT VT = OpLHS.getValueType();
7976 
7977   switch (OpNum) {
7978   default: llvm_unreachable("Unknown shuffle opcode!");
7979   case OP_VREV:
7980     // VREV divides the vector in half and swaps within the half.
7981     if (VT.getVectorElementType() == MVT::i32 ||
7982         VT.getVectorElementType() == MVT::f32)
7983       return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
7984     // vrev <4 x i16> -> VREV32
7985     if (VT.getVectorElementType() == MVT::i16)
7986       return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
7987     // vrev <4 x i8> -> VREV16
7988     assert(VT.getVectorElementType() == MVT::i8);
7989     return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
7990   case OP_VDUP0:
7991   case OP_VDUP1:
7992   case OP_VDUP2:
7993   case OP_VDUP3:
7994     return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7995                        OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
7996   case OP_VEXT1:
7997   case OP_VEXT2:
7998   case OP_VEXT3:
7999     return DAG.getNode(ARMISD::VEXT, dl, VT,
8000                        OpLHS, OpRHS,
8001                        DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8002   case OP_VUZPL:
8003   case OP_VUZPR:
8004     return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8005                        OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8006   case OP_VZIPL:
8007   case OP_VZIPR:
8008     return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8009                        OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8010   case OP_VTRNL:
8011   case OP_VTRNR:
8012     return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8013                        OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8014   }
8015 }
8016 
8017 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
8018                                        ArrayRef<int> ShuffleMask,
8019                                        SelectionDAG &DAG) {
8020   // Check to see if we can use the VTBL instruction.
8021   SDValue V1 = Op.getOperand(0);
8022   SDValue V2 = Op.getOperand(1);
8023   SDLoc DL(Op);
8024 
8025   SmallVector<SDValue, 8> VTBLMask;
8026   for (ArrayRef<int>::iterator
8027          I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
8028     VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));
8029 
8030   if (V2.getNode()->isUndef())
8031     return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8032                        DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8033 
8034   return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8035                      DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8036 }
8037 
8038 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
8039                                                       SelectionDAG &DAG) {
8040   SDLoc DL(Op);
8041   SDValue OpLHS = Op.getOperand(0);
8042   EVT VT = OpLHS.getValueType();
8043 
8044   assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
8045          "Expect an v8i16/v16i8 type");
8046   OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
8047   // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
8048   // extract the first 8 bytes into the top double word and the last 8 bytes
8049   // into the bottom double word. The v8i16 case is similar.
8050   unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
8051   return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
8052                      DAG.getConstant(ExtractNum, DL, MVT::i32));
8053 }
8054 
8055 static EVT getVectorTyFromPredicateVector(EVT VT) {
8056   switch (VT.getSimpleVT().SimpleTy) {
8057   case MVT::v4i1:
8058     return MVT::v4i32;
8059   case MVT::v8i1:
8060     return MVT::v8i16;
8061   case MVT::v16i1:
8062     return MVT::v16i8;
8063   default:
8064     llvm_unreachable("Unexpected vector predicate type");
8065   }
8066 }
8067 
8068 static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
8069                                     SelectionDAG &DAG) {
8070   // Converting from boolean predicates to integers involves creating a vector
8071   // of all ones or all zeroes and selecting the lanes based upon the real
8072   // predicate.
8073   SDValue AllOnes =
8074       DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8075   AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8076 
8077   SDValue AllZeroes =
8078       DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8079   AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8080 
8081   // Get full vector type from predicate type
8082   EVT NewVT = getVectorTyFromPredicateVector(VT);
8083 
8084   SDValue RecastV1;
8085   // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
8086   // this to a v16i1. This cannot be done with an ordinary bitcast because the
8087   // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
8088   // since we know in hardware the sizes are really the same.
8089   if (VT != MVT::v16i1)
8090     RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8091   else
8092     RecastV1 = Pred;
8093 
8094   // Select either all ones or zeroes depending upon the real predicate bits.
8095   SDValue PredAsVector =
8096       DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8097 
8098   // Recast our new predicate-as-integer v16i8 vector into something
8099   // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8100   return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8101 }
8102 
8103 static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
8104                                       const ARMSubtarget *ST) {
8105   EVT VT = Op.getValueType();
8106   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8107   ArrayRef<int> ShuffleMask = SVN->getMask();
8108 
8109   assert(ST->hasMVEIntegerOps() &&
8110          "No support for vector shuffle of boolean predicates");
8111 
8112   SDValue V1 = Op.getOperand(0);
8113   SDLoc dl(Op);
8114   if (isReverseMask(ShuffleMask, VT)) {
8115     SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8116     SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8117     SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8118                               DAG.getConstant(16, dl, MVT::i32));
8119     return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8120   }
8121 
8122   // Until we can come up with optimised cases for every single vector
8123   // shuffle in existence we have chosen the least painful strategy. This is
8124   // to essentially promote the boolean predicate to a 8-bit integer, where
8125   // each predicate represents a byte. Then we fall back on a normal integer
8126   // vector shuffle and convert the result back into a predicate vector. In
8127   // many cases the generated code might be even better than scalar code
8128   // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8129   // fields in a register into 8 other arbitrary 2-bit fields!
8130   SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG);
8131   EVT NewVT = PredAsVector.getValueType();
8132 
8133   // Do the shuffle!
8134   SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector,
8135                                           DAG.getUNDEF(NewVT), ShuffleMask);
8136 
8137   // Now return the result of comparing the shuffled vector with zero,
8138   // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8139   return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8140                      DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8141 }
8142 
8143 static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
8144                                             ArrayRef<int> ShuffleMask,
8145                                             SelectionDAG &DAG) {
8146   // Attempt to lower the vector shuffle using as many whole register movs as
8147   // possible. This is useful for types smaller than 32bits, which would
8148   // often otherwise become a series for grp movs.
8149   SDLoc dl(Op);
8150   EVT VT = Op.getValueType();
8151   if (VT.getScalarSizeInBits() >= 32)
8152     return SDValue();
8153 
8154   assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8155          "Unexpected vector type");
8156   int NumElts = VT.getVectorNumElements();
8157   int QuarterSize = NumElts / 4;
8158   // The four final parts of the vector, as i32's
8159   SDValue Parts[4];
8160 
8161   // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8162   // <u,u,u,u>), returning the vmov lane index
8163   auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8164     // Detect which mov lane this would be from the first non-undef element.
8165     int MovIdx = -1;
8166     for (int i = 0; i < Length; i++) {
8167       if (ShuffleMask[Start + i] >= 0) {
8168         if (ShuffleMask[Start + i] % Length != i)
8169           return -1;
8170         MovIdx = ShuffleMask[Start + i] / Length;
8171         break;
8172       }
8173     }
8174     // If all items are undef, leave this for other combines
8175     if (MovIdx == -1)
8176       return -1;
8177     // Check the remaining values are the correct part of the same mov
8178     for (int i = 1; i < Length; i++) {
8179       if (ShuffleMask[Start + i] >= 0 &&
8180           (ShuffleMask[Start + i] / Length != MovIdx ||
8181            ShuffleMask[Start + i] % Length != i))
8182         return -1;
8183     }
8184     return MovIdx;
8185   };
8186 
8187   for (int Part = 0; Part < 4; ++Part) {
8188     // Does this part look like a mov
8189     int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8190     if (Elt != -1) {
8191       SDValue Input = Op->getOperand(0);
8192       if (Elt >= 4) {
8193         Input = Op->getOperand(1);
8194         Elt -= 4;
8195       }
8196       SDValue BitCast = DAG.getBitcast(MVT::v4i32, Input);
8197       Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast,
8198                                 DAG.getConstant(Elt, dl, MVT::i32));
8199     }
8200   }
8201 
8202   // Nothing interesting found, just return
8203   if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8204     return SDValue();
8205 
8206   // The other parts need to be built with the old shuffle vector, cast to a
8207   // v4i32 and extract_vector_elts
8208   if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
8209     SmallVector<int, 16> NewShuffleMask;
8210     for (int Part = 0; Part < 4; ++Part)
8211       for (int i = 0; i < QuarterSize; i++)
8212         NewShuffleMask.push_back(
8213             Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8214     SDValue NewShuffle = DAG.getVectorShuffle(
8215         VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8216     SDValue BitCast = DAG.getBitcast(MVT::v4i32, NewShuffle);
8217 
8218     for (int Part = 0; Part < 4; ++Part)
8219       if (!Parts[Part])
8220         Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
8221                                   BitCast, DAG.getConstant(Part, dl, MVT::i32));
8222   }
8223   // Build a vector out of the various parts and bitcast it back to the original
8224   // type.
8225   SDValue NewVec = DAG.getBuildVector(MVT::v4i32, dl, Parts);
8226   return DAG.getBitcast(VT, NewVec);
8227 }
8228 
8229 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
8230                                    const ARMSubtarget *ST) {
8231   SDValue V1 = Op.getOperand(0);
8232   SDValue V2 = Op.getOperand(1);
8233   SDLoc dl(Op);
8234   EVT VT = Op.getValueType();
8235   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8236   unsigned EltSize = VT.getScalarSizeInBits();
8237 
8238   if (ST->hasMVEIntegerOps() && EltSize == 1)
8239     return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8240 
8241   // Convert shuffles that are directly supported on NEON to target-specific
8242   // DAG nodes, instead of keeping them as shuffles and matching them again
8243   // during code selection.  This is more efficient and avoids the possibility
8244   // of inconsistencies between legalization and selection.
8245   // FIXME: floating-point vectors should be canonicalized to integer vectors
8246   // of the same time so that they get CSEd properly.
8247   ArrayRef<int> ShuffleMask = SVN->getMask();
8248 
8249   if (EltSize <= 32) {
8250     if (SVN->isSplat()) {
8251       int Lane = SVN->getSplatIndex();
8252       // If this is undef splat, generate it via "just" vdup, if possible.
8253       if (Lane == -1) Lane = 0;
8254 
8255       // Test if V1 is a SCALAR_TO_VECTOR.
8256       if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8257         return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8258       }
8259       // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8260       // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8261       // reaches it).
8262       if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8263           !isa<ConstantSDNode>(V1.getOperand(0))) {
8264         bool IsScalarToVector = true;
8265         for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8266           if (!V1.getOperand(i).isUndef()) {
8267             IsScalarToVector = false;
8268             break;
8269           }
8270         if (IsScalarToVector)
8271           return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8272       }
8273       return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8274                          DAG.getConstant(Lane, dl, MVT::i32));
8275     }
8276 
8277     bool ReverseVEXT = false;
8278     unsigned Imm = 0;
8279     if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8280       if (ReverseVEXT)
8281         std::swap(V1, V2);
8282       return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8283                          DAG.getConstant(Imm, dl, MVT::i32));
8284     }
8285 
8286     if (isVREVMask(ShuffleMask, VT, 64))
8287       return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8288     if (isVREVMask(ShuffleMask, VT, 32))
8289       return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8290     if (isVREVMask(ShuffleMask, VT, 16))
8291       return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8292 
8293     if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8294       return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8295                          DAG.getConstant(Imm, dl, MVT::i32));
8296     }
8297 
8298     // Check for Neon shuffles that modify both input vectors in place.
8299     // If both results are used, i.e., if there are two shuffles with the same
8300     // source operands and with masks corresponding to both results of one of
8301     // these operations, DAG memoization will ensure that a single node is
8302     // used for both shuffles.
8303     unsigned WhichResult = 0;
8304     bool isV_UNDEF = false;
8305     if (ST->hasNEON()) {
8306       if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8307               ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8308         if (isV_UNDEF)
8309           V2 = V1;
8310         return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8311             .getValue(WhichResult);
8312       }
8313     }
8314     if (ST->hasMVEIntegerOps()) {
8315       if (isVMOVNMask(ShuffleMask, VT, 0))
8316         return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8317                            DAG.getConstant(0, dl, MVT::i32));
8318       if (isVMOVNMask(ShuffleMask, VT, 1))
8319         return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8320                            DAG.getConstant(1, dl, MVT::i32));
8321     }
8322 
8323     // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8324     // shuffles that produce a result larger than their operands with:
8325     //   shuffle(concat(v1, undef), concat(v2, undef))
8326     // ->
8327     //   shuffle(concat(v1, v2), undef)
8328     // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8329     //
8330     // This is useful in the general case, but there are special cases where
8331     // native shuffles produce larger results: the two-result ops.
8332     //
8333     // Look through the concat when lowering them:
8334     //   shuffle(concat(v1, v2), undef)
8335     // ->
8336     //   concat(VZIP(v1, v2):0, :1)
8337     //
8338     if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8339       SDValue SubV1 = V1->getOperand(0);
8340       SDValue SubV2 = V1->getOperand(1);
8341       EVT SubVT = SubV1.getValueType();
8342 
8343       // We expect these to have been canonicalized to -1.
8344       assert(llvm::all_of(ShuffleMask, [&](int i) {
8345         return i < (int)VT.getVectorNumElements();
8346       }) && "Unexpected shuffle index into UNDEF operand!");
8347 
8348       if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8349               ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8350         if (isV_UNDEF)
8351           SubV2 = SubV1;
8352         assert((WhichResult == 0) &&
8353                "In-place shuffle of concat can only have one result!");
8354         SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8355                                   SubV1, SubV2);
8356         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8357                            Res.getValue(1));
8358       }
8359     }
8360   }
8361 
8362   // If the shuffle is not directly supported and it has 4 elements, use
8363   // the PerfectShuffle-generated table to synthesize it from other shuffles.
8364   unsigned NumElts = VT.getVectorNumElements();
8365   if (NumElts == 4) {
8366     unsigned PFIndexes[4];
8367     for (unsigned i = 0; i != 4; ++i) {
8368       if (ShuffleMask[i] < 0)
8369         PFIndexes[i] = 8;
8370       else
8371         PFIndexes[i] = ShuffleMask[i];
8372     }
8373 
8374     // Compute the index in the perfect shuffle table.
8375     unsigned PFTableIndex =
8376       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8377     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8378     unsigned Cost = (PFEntry >> 30);
8379 
8380     if (Cost <= 4) {
8381       if (ST->hasNEON())
8382         return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8383       else if (isLegalMVEShuffleOp(PFEntry)) {
8384         unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8385         unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
8386         unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8387         unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8388         if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8389           return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8390       }
8391     }
8392   }
8393 
8394   // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8395   if (EltSize >= 32) {
8396     // Do the expansion with floating-point types, since that is what the VFP
8397     // registers are defined to use, and since i64 is not legal.
8398     EVT EltVT = EVT::getFloatingPointVT(EltSize);
8399     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8400     V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
8401     V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
8402     SmallVector<SDValue, 8> Ops;
8403     for (unsigned i = 0; i < NumElts; ++i) {
8404       if (ShuffleMask[i] < 0)
8405         Ops.push_back(DAG.getUNDEF(EltVT));
8406       else
8407         Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8408                                   ShuffleMask[i] < (int)NumElts ? V1 : V2,
8409                                   DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8410                                                   dl, MVT::i32)));
8411     }
8412     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8413     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8414   }
8415 
8416   if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
8417     return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
8418 
8419   if (ST->hasNEON() && VT == MVT::v8i8)
8420     if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8421       return NewOp;
8422 
8423   if (ST->hasMVEIntegerOps())
8424     if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8425       return NewOp;
8426 
8427   return SDValue();
8428 }
8429 
8430 static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8431                                          const ARMSubtarget *ST) {
8432   EVT VecVT = Op.getOperand(0).getValueType();
8433   SDLoc dl(Op);
8434 
8435   assert(ST->hasMVEIntegerOps() &&
8436          "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8437 
8438   SDValue Conv =
8439       DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8440   unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
8441   unsigned LaneWidth =
8442       getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
8443   unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
8444   SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
8445                             Op.getOperand(1), DAG.getValueType(MVT::i1));
8446   SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
8447                             DAG.getConstant(~Mask, dl, MVT::i32));
8448   return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
8449 }
8450 
8451 SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8452                                                   SelectionDAG &DAG) const {
8453   // INSERT_VECTOR_ELT is legal only for immediate indexes.
8454   SDValue Lane = Op.getOperand(2);
8455   if (!isa<ConstantSDNode>(Lane))
8456     return SDValue();
8457 
8458   SDValue Elt = Op.getOperand(1);
8459   EVT EltVT = Elt.getValueType();
8460 
8461   if (Subtarget->hasMVEIntegerOps() &&
8462       Op.getValueType().getScalarSizeInBits() == 1)
8463     return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
8464 
8465   if (getTypeAction(*DAG.getContext(), EltVT) ==
8466       TargetLowering::TypePromoteFloat) {
8467     // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
8468     // but the type system will try to do that if we don't intervene.
8469     // Reinterpret any such vector-element insertion as one with the
8470     // corresponding integer types.
8471 
8472     SDLoc dl(Op);
8473 
8474     EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
8475     assert(getTypeAction(*DAG.getContext(), IEltVT) !=
8476            TargetLowering::TypePromoteFloat);
8477 
8478     SDValue VecIn = Op.getOperand(0);
8479     EVT VecVT = VecIn.getValueType();
8480     EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
8481                                   VecVT.getVectorNumElements());
8482 
8483     SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
8484     SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
8485     SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
8486                                   IVecIn, IElt, Lane);
8487     return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
8488   }
8489 
8490   return Op;
8491 }
8492 
8493 static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8494                                           const ARMSubtarget *ST) {
8495   EVT VecVT = Op.getOperand(0).getValueType();
8496   SDLoc dl(Op);
8497 
8498   assert(ST->hasMVEIntegerOps() &&
8499          "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8500 
8501   SDValue Conv =
8502       DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8503   unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
8504   unsigned LaneWidth =
8505       getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
8506   SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
8507                               DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
8508   return Shift;
8509 }
8510 
8511 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
8512                                        const ARMSubtarget *ST) {
8513   // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
8514   SDValue Lane = Op.getOperand(1);
8515   if (!isa<ConstantSDNode>(Lane))
8516     return SDValue();
8517 
8518   SDValue Vec = Op.getOperand(0);
8519   EVT VT = Vec.getValueType();
8520 
8521   if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8522     return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
8523 
8524   if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
8525     SDLoc dl(Op);
8526     return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
8527   }
8528 
8529   return Op;
8530 }
8531 
8532 static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
8533                                       const ARMSubtarget *ST) {
8534   SDValue V1 = Op.getOperand(0);
8535   SDValue V2 = Op.getOperand(1);
8536   SDLoc dl(Op);
8537   EVT VT = Op.getValueType();
8538   EVT Op1VT = V1.getValueType();
8539   EVT Op2VT = V2.getValueType();
8540   unsigned NumElts = VT.getVectorNumElements();
8541 
8542   assert(Op1VT == Op2VT && "Operand types don't match!");
8543   assert(VT.getScalarSizeInBits() == 1 &&
8544          "Unexpected custom CONCAT_VECTORS lowering");
8545   assert(ST->hasMVEIntegerOps() &&
8546          "CONCAT_VECTORS lowering only supported for MVE");
8547 
8548   SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8549   SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
8550 
8551   // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
8552   // promoted to v8i16, etc.
8553 
8554   MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
8555 
8556   // Extract the vector elements from Op1 and Op2 one by one and truncate them
8557   // to be the right size for the destination. For example, if Op1 is v4i1 then
8558   // the promoted vector is v4i32. The result of concatentation gives a v8i1,
8559   // which when promoted is v8i16. That means each i32 element from Op1 needs
8560   // truncating to i16 and inserting in the result.
8561   EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
8562   SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
8563   auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
8564     EVT NewVT = NewV.getValueType();
8565     EVT ConcatVT = ConVec.getValueType();
8566     for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
8567       SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
8568                                 DAG.getIntPtrConstant(i, dl));
8569       ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
8570                            DAG.getConstant(j, dl, MVT::i32));
8571     }
8572     return ConVec;
8573   };
8574   unsigned j = 0;
8575   ConVec = ExractInto(NewV1, ConVec, j);
8576   ConVec = ExractInto(NewV2, ConVec, j);
8577 
8578   // Now return the result of comparing the subvector with zero,
8579   // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8580   return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8581                      DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8582 }
8583 
8584 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
8585                                    const ARMSubtarget *ST) {
8586   EVT VT = Op->getValueType(0);
8587   if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8588     return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
8589 
8590   // The only time a CONCAT_VECTORS operation can have legal types is when
8591   // two 64-bit vectors are concatenated to a 128-bit vector.
8592   assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
8593          "unexpected CONCAT_VECTORS");
8594   SDLoc dl(Op);
8595   SDValue Val = DAG.getUNDEF(MVT::v2f64);
8596   SDValue Op0 = Op.getOperand(0);
8597   SDValue Op1 = Op.getOperand(1);
8598   if (!Op0.isUndef())
8599     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8600                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
8601                       DAG.getIntPtrConstant(0, dl));
8602   if (!Op1.isUndef())
8603     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8604                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
8605                       DAG.getIntPtrConstant(1, dl));
8606   return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
8607 }
8608 
8609 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
8610                                       const ARMSubtarget *ST) {
8611   SDValue V1 = Op.getOperand(0);
8612   SDValue V2 = Op.getOperand(1);
8613   SDLoc dl(Op);
8614   EVT VT = Op.getValueType();
8615   EVT Op1VT = V1.getValueType();
8616   unsigned NumElts = VT.getVectorNumElements();
8617   unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue();
8618 
8619   assert(VT.getScalarSizeInBits() == 1 &&
8620          "Unexpected custom EXTRACT_SUBVECTOR lowering");
8621   assert(ST->hasMVEIntegerOps() &&
8622          "EXTRACT_SUBVECTOR lowering only supported for MVE");
8623 
8624   SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8625 
8626   // We now have Op1 promoted to a vector of integers, where v8i1 gets
8627   // promoted to v8i16, etc.
8628 
8629   MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
8630 
8631   EVT SubVT = MVT::getVectorVT(ElType, NumElts);
8632   SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
8633   for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
8634     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
8635                               DAG.getIntPtrConstant(i, dl));
8636     SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
8637                          DAG.getConstant(j, dl, MVT::i32));
8638   }
8639 
8640   // Now return the result of comparing the subvector with zero,
8641   // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8642   return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
8643                      DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8644 }
8645 
8646 // Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
8647 static SDValue LowerTruncatei1(SDValue N, SelectionDAG &DAG,
8648                                const ARMSubtarget *ST) {
8649   assert(ST->hasMVEIntegerOps() && "Expected MVE!");
8650   EVT VT = N.getValueType();
8651   assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
8652          "Expected a vector i1 type!");
8653   SDValue Op = N.getOperand(0);
8654   EVT FromVT = Op.getValueType();
8655   SDLoc DL(N);
8656 
8657   SDValue And =
8658       DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
8659   return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
8660                      DAG.getCondCode(ISD::SETNE));
8661 }
8662 
8663 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
8664 /// element has been zero/sign-extended, depending on the isSigned parameter,
8665 /// from an integer type half its size.
8666 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
8667                                    bool isSigned) {
8668   // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
8669   EVT VT = N->getValueType(0);
8670   if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
8671     SDNode *BVN = N->getOperand(0).getNode();
8672     if (BVN->getValueType(0) != MVT::v4i32 ||
8673         BVN->getOpcode() != ISD::BUILD_VECTOR)
8674       return false;
8675     unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
8676     unsigned HiElt = 1 - LoElt;
8677     ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
8678     ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
8679     ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
8680     ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
8681     if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
8682       return false;
8683     if (isSigned) {
8684       if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
8685           Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
8686         return true;
8687     } else {
8688       if (Hi0->isNullValue() && Hi1->isNullValue())
8689         return true;
8690     }
8691     return false;
8692   }
8693 
8694   if (N->getOpcode() != ISD::BUILD_VECTOR)
8695     return false;
8696 
8697   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
8698     SDNode *Elt = N->getOperand(i).getNode();
8699     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
8700       unsigned EltSize = VT.getScalarSizeInBits();
8701       unsigned HalfSize = EltSize / 2;
8702       if (isSigned) {
8703         if (!isIntN(HalfSize, C->getSExtValue()))
8704           return false;
8705       } else {
8706         if (!isUIntN(HalfSize, C->getZExtValue()))
8707           return false;
8708       }
8709       continue;
8710     }
8711     return false;
8712   }
8713 
8714   return true;
8715 }
8716 
8717 /// isSignExtended - Check if a node is a vector value that is sign-extended
8718 /// or a constant BUILD_VECTOR with sign-extended elements.
8719 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
8720   if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
8721     return true;
8722   if (isExtendedBUILD_VECTOR(N, DAG, true))
8723     return true;
8724   return false;
8725 }
8726 
8727 /// isZeroExtended - Check if a node is a vector value that is zero-extended (or
8728 /// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
8729 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
8730   if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
8731       ISD::isZEXTLoad(N))
8732     return true;
8733   if (isExtendedBUILD_VECTOR(N, DAG, false))
8734     return true;
8735   return false;
8736 }
8737 
8738 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
8739   if (OrigVT.getSizeInBits() >= 64)
8740     return OrigVT;
8741 
8742   assert(OrigVT.isSimple() && "Expecting a simple value type");
8743 
8744   MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
8745   switch (OrigSimpleTy) {
8746   default: llvm_unreachable("Unexpected Vector Type");
8747   case MVT::v2i8:
8748   case MVT::v2i16:
8749      return MVT::v2i32;
8750   case MVT::v4i8:
8751     return  MVT::v4i16;
8752   }
8753 }
8754 
8755 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
8756 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
8757 /// We insert the required extension here to get the vector to fill a D register.
8758 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
8759                                             const EVT &OrigTy,
8760                                             const EVT &ExtTy,
8761                                             unsigned ExtOpcode) {
8762   // The vector originally had a size of OrigTy. It was then extended to ExtTy.
8763   // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
8764   // 64-bits we need to insert a new extension so that it will be 64-bits.
8765   assert(ExtTy.is128BitVector() && "Unexpected extension size");
8766   if (OrigTy.getSizeInBits() >= 64)
8767     return N;
8768 
8769   // Must extend size to at least 64 bits to be used as an operand for VMULL.
8770   EVT NewVT = getExtensionTo64Bits(OrigTy);
8771 
8772   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
8773 }
8774 
8775 /// SkipLoadExtensionForVMULL - return a load of the original vector size that
8776 /// does not do any sign/zero extension. If the original vector is less
8777 /// than 64 bits, an appropriate extension will be added after the load to
8778 /// reach a total size of 64 bits. We have to add the extension separately
8779 /// because ARM does not have a sign/zero extending load for vectors.
8780 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
8781   EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
8782 
8783   // The load already has the right type.
8784   if (ExtendedTy == LD->getMemoryVT())
8785     return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
8786                        LD->getBasePtr(), LD->getPointerInfo(),
8787                        LD->getAlignment(), LD->getMemOperand()->getFlags());
8788 
8789   // We need to create a zextload/sextload. We cannot just create a load
8790   // followed by a zext/zext node because LowerMUL is also run during normal
8791   // operation legalization where we can't create illegal types.
8792   return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
8793                         LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
8794                         LD->getMemoryVT(), LD->getAlignment(),
8795                         LD->getMemOperand()->getFlags());
8796 }
8797 
8798 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
8799 /// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
8800 /// the unextended value. The unextended vector should be 64 bits so that it can
8801 /// be used as an operand to a VMULL instruction. If the original vector size
8802 /// before extension is less than 64 bits we add a an extension to resize
8803 /// the vector to 64 bits.
8804 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
8805   if (N->getOpcode() == ISD::SIGN_EXTEND ||
8806       N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
8807     return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
8808                                         N->getOperand(0)->getValueType(0),
8809                                         N->getValueType(0),
8810                                         N->getOpcode());
8811 
8812   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
8813     assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
8814            "Expected extending load");
8815 
8816     SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
8817     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
8818     unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8819     SDValue extLoad =
8820         DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
8821     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
8822 
8823     return newLoad;
8824   }
8825 
8826   // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
8827   // have been legalized as a BITCAST from v4i32.
8828   if (N->getOpcode() == ISD::BITCAST) {
8829     SDNode *BVN = N->getOperand(0).getNode();
8830     assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
8831            BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
8832     unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
8833     return DAG.getBuildVector(
8834         MVT::v2i32, SDLoc(N),
8835         {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
8836   }
8837   // Construct a new BUILD_VECTOR with elements truncated to half the size.
8838   assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
8839   EVT VT = N->getValueType(0);
8840   unsigned EltSize = VT.getScalarSizeInBits() / 2;
8841   unsigned NumElts = VT.getVectorNumElements();
8842   MVT TruncVT = MVT::getIntegerVT(EltSize);
8843   SmallVector<SDValue, 8> Ops;
8844   SDLoc dl(N);
8845   for (unsigned i = 0; i != NumElts; ++i) {
8846     ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
8847     const APInt &CInt = C->getAPIntValue();
8848     // Element types smaller than 32 bits are not legal, so use i32 elements.
8849     // The values are implicitly truncated so sext vs. zext doesn't matter.
8850     Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
8851   }
8852   return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
8853 }
8854 
8855 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
8856   unsigned Opcode = N->getOpcode();
8857   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
8858     SDNode *N0 = N->getOperand(0).getNode();
8859     SDNode *N1 = N->getOperand(1).getNode();
8860     return N0->hasOneUse() && N1->hasOneUse() &&
8861       isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
8862   }
8863   return false;
8864 }
8865 
8866 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
8867   unsigned Opcode = N->getOpcode();
8868   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
8869     SDNode *N0 = N->getOperand(0).getNode();
8870     SDNode *N1 = N->getOperand(1).getNode();
8871     return N0->hasOneUse() && N1->hasOneUse() &&
8872       isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
8873   }
8874   return false;
8875 }
8876 
8877 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
8878   // Multiplications are only custom-lowered for 128-bit vectors so that
8879   // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
8880   EVT VT = Op.getValueType();
8881   assert(VT.is128BitVector() && VT.isInteger() &&
8882          "unexpected type for custom-lowering ISD::MUL");
8883   SDNode *N0 = Op.getOperand(0).getNode();
8884   SDNode *N1 = Op.getOperand(1).getNode();
8885   unsigned NewOpc = 0;
8886   bool isMLA = false;
8887   bool isN0SExt = isSignExtended(N0, DAG);
8888   bool isN1SExt = isSignExtended(N1, DAG);
8889   if (isN0SExt && isN1SExt)
8890     NewOpc = ARMISD::VMULLs;
8891   else {
8892     bool isN0ZExt = isZeroExtended(N0, DAG);
8893     bool isN1ZExt = isZeroExtended(N1, DAG);
8894     if (isN0ZExt && isN1ZExt)
8895       NewOpc = ARMISD::VMULLu;
8896     else if (isN1SExt || isN1ZExt) {
8897       // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
8898       // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
8899       if (isN1SExt && isAddSubSExt(N0, DAG)) {
8900         NewOpc = ARMISD::VMULLs;
8901         isMLA = true;
8902       } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
8903         NewOpc = ARMISD::VMULLu;
8904         isMLA = true;
8905       } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
8906         std::swap(N0, N1);
8907         NewOpc = ARMISD::VMULLu;
8908         isMLA = true;
8909       }
8910     }
8911 
8912     if (!NewOpc) {
8913       if (VT == MVT::v2i64)
8914         // Fall through to expand this.  It is not legal.
8915         return SDValue();
8916       else
8917         // Other vector multiplications are legal.
8918         return Op;
8919     }
8920   }
8921 
8922   // Legalize to a VMULL instruction.
8923   SDLoc DL(Op);
8924   SDValue Op0;
8925   SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
8926   if (!isMLA) {
8927     Op0 = SkipExtensionForVMULL(N0, DAG);
8928     assert(Op0.getValueType().is64BitVector() &&
8929            Op1.getValueType().is64BitVector() &&
8930            "unexpected types for extended operands to VMULL");
8931     return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
8932   }
8933 
8934   // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
8935   // isel lowering to take advantage of no-stall back to back vmul + vmla.
8936   //   vmull q0, d4, d6
8937   //   vmlal q0, d5, d6
8938   // is faster than
8939   //   vaddl q0, d4, d5
8940   //   vmovl q1, d6
8941   //   vmul  q0, q0, q1
8942   SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
8943   SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
8944   EVT Op1VT = Op1.getValueType();
8945   return DAG.getNode(N0->getOpcode(), DL, VT,
8946                      DAG.getNode(NewOpc, DL, VT,
8947                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
8948                      DAG.getNode(NewOpc, DL, VT,
8949                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
8950 }
8951 
8952 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
8953                               SelectionDAG &DAG) {
8954   // TODO: Should this propagate fast-math-flags?
8955 
8956   // Convert to float
8957   // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
8958   // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
8959   X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
8960   Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
8961   X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
8962   Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
8963   // Get reciprocal estimate.
8964   // float4 recip = vrecpeq_f32(yf);
8965   Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
8966                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
8967                    Y);
8968   // Because char has a smaller range than uchar, we can actually get away
8969   // without any newton steps.  This requires that we use a weird bias
8970   // of 0xb000, however (again, this has been exhaustively tested).
8971   // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
8972   X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
8973   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
8974   Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
8975   X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
8976   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
8977   // Convert back to short.
8978   X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
8979   X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
8980   return X;
8981 }
8982 
8983 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
8984                                SelectionDAG &DAG) {
8985   // TODO: Should this propagate fast-math-flags?
8986 
8987   SDValue N2;
8988   // Convert to float.
8989   // float4 yf = vcvt_f32_s32(vmovl_s16(y));
8990   // float4 xf = vcvt_f32_s32(vmovl_s16(x));
8991   N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
8992   N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
8993   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
8994   N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
8995 
8996   // Use reciprocal estimate and one refinement step.
8997   // float4 recip = vrecpeq_f32(yf);
8998   // recip *= vrecpsq_f32(yf, recip);
8999   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9000                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9001                    N1);
9002   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9003                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9004                    N1, N2);
9005   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9006   // Because short has a smaller range than ushort, we can actually get away
9007   // with only a single newton step.  This requires that we use a weird bias
9008   // of 89, however (again, this has been exhaustively tested).
9009   // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9010   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9011   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9012   N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9013   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9014   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9015   // Convert back to integer and return.
9016   // return vmovn_s32(vcvt_s32_f32(result));
9017   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9018   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9019   return N0;
9020 }
9021 
9022 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
9023                          const ARMSubtarget *ST) {
9024   EVT VT = Op.getValueType();
9025   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9026          "unexpected type for custom-lowering ISD::SDIV");
9027 
9028   SDLoc dl(Op);
9029   SDValue N0 = Op.getOperand(0);
9030   SDValue N1 = Op.getOperand(1);
9031   SDValue N2, N3;
9032 
9033   if (VT == MVT::v8i8) {
9034     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9035     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9036 
9037     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9038                      DAG.getIntPtrConstant(4, dl));
9039     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9040                      DAG.getIntPtrConstant(4, dl));
9041     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9042                      DAG.getIntPtrConstant(0, dl));
9043     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9044                      DAG.getIntPtrConstant(0, dl));
9045 
9046     N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
9047     N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
9048 
9049     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9050     N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9051 
9052     N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9053     return N0;
9054   }
9055   return LowerSDIV_v4i16(N0, N1, dl, DAG);
9056 }
9057 
9058 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
9059                          const ARMSubtarget *ST) {
9060   // TODO: Should this propagate fast-math-flags?
9061   EVT VT = Op.getValueType();
9062   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9063          "unexpected type for custom-lowering ISD::UDIV");
9064 
9065   SDLoc dl(Op);
9066   SDValue N0 = Op.getOperand(0);
9067   SDValue N1 = Op.getOperand(1);
9068   SDValue N2, N3;
9069 
9070   if (VT == MVT::v8i8) {
9071     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9072     N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9073 
9074     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9075                      DAG.getIntPtrConstant(4, dl));
9076     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9077                      DAG.getIntPtrConstant(4, dl));
9078     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9079                      DAG.getIntPtrConstant(0, dl));
9080     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9081                      DAG.getIntPtrConstant(0, dl));
9082 
9083     N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9084     N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
9085 
9086     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9087     N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9088 
9089     N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9090                      DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9091                                      MVT::i32),
9092                      N0);
9093     return N0;
9094   }
9095 
9096   // v4i16 sdiv ... Convert to float.
9097   // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9098   // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9099   N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9100   N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9101   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9102   SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9103 
9104   // Use reciprocal estimate and two refinement steps.
9105   // float4 recip = vrecpeq_f32(yf);
9106   // recip *= vrecpsq_f32(yf, recip);
9107   // recip *= vrecpsq_f32(yf, recip);
9108   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9109                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9110                    BN1);
9111   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9112                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9113                    BN1, N2);
9114   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9115   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9116                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9117                    BN1, N2);
9118   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9119   // Simply multiplying by the reciprocal estimate can leave us a few ulps
9120   // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9121   // and that it will never cause us to return an answer too large).
9122   // float4 result = as_float4(as_int4(xf*recip) + 2);
9123   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9124   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9125   N1 = DAG.getConstant(2, dl, MVT::v4i32);
9126   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9127   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9128   // Convert back to integer and return.
9129   // return vmovn_u32(vcvt_s32_f32(result));
9130   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9131   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9132   return N0;
9133 }
9134 
9135 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
9136   SDNode *N = Op.getNode();
9137   EVT VT = N->getValueType(0);
9138   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
9139 
9140   SDValue Carry = Op.getOperand(2);
9141 
9142   SDLoc DL(Op);
9143 
9144   SDValue Result;
9145   if (Op.getOpcode() == ISD::ADDCARRY) {
9146     // This converts the boolean value carry into the carry flag.
9147     Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9148 
9149     // Do the addition proper using the carry flag we wanted.
9150     Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
9151                          Op.getOperand(1), Carry);
9152 
9153     // Now convert the carry flag into a boolean value.
9154     Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9155   } else {
9156     // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
9157     // have to invert the carry first.
9158     Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9159                         DAG.getConstant(1, DL, MVT::i32), Carry);
9160     // This converts the boolean value carry into the carry flag.
9161     Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9162 
9163     // Do the subtraction proper using the carry flag we wanted.
9164     Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
9165                          Op.getOperand(1), Carry);
9166 
9167     // Now convert the carry flag into a boolean value.
9168     Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9169     // But the carry returned by ARMISD::SUBE is not a borrow as expected
9170     // by ISD::SUBCARRY, so compute 1 - C.
9171     Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9172                         DAG.getConstant(1, DL, MVT::i32), Carry);
9173   }
9174 
9175   // Return both values.
9176   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
9177 }
9178 
9179 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
9180   assert(Subtarget->isTargetDarwin());
9181 
9182   // For iOS, we want to call an alternative entry point: __sincos_stret,
9183   // return values are passed via sret.
9184   SDLoc dl(Op);
9185   SDValue Arg = Op.getOperand(0);
9186   EVT ArgVT = Arg.getValueType();
9187   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
9188   auto PtrVT = getPointerTy(DAG.getDataLayout());
9189 
9190   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9191   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9192 
9193   // Pair of floats / doubles used to pass the result.
9194   Type *RetTy = StructType::get(ArgTy, ArgTy);
9195   auto &DL = DAG.getDataLayout();
9196 
9197   ArgListTy Args;
9198   bool ShouldUseSRet = Subtarget->isAPCS_ABI();
9199   SDValue SRet;
9200   if (ShouldUseSRet) {
9201     // Create stack object for sret.
9202     const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
9203     const Align StackAlign = DL.getPrefTypeAlign(RetTy);
9204     int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
9205     SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
9206 
9207     ArgListEntry Entry;
9208     Entry.Node = SRet;
9209     Entry.Ty = RetTy->getPointerTo();
9210     Entry.IsSExt = false;
9211     Entry.IsZExt = false;
9212     Entry.IsSRet = true;
9213     Args.push_back(Entry);
9214     RetTy = Type::getVoidTy(*DAG.getContext());
9215   }
9216 
9217   ArgListEntry Entry;
9218   Entry.Node = Arg;
9219   Entry.Ty = ArgTy;
9220   Entry.IsSExt = false;
9221   Entry.IsZExt = false;
9222   Args.push_back(Entry);
9223 
9224   RTLIB::Libcall LC =
9225       (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
9226   const char *LibcallName = getLibcallName(LC);
9227   CallingConv::ID CC = getLibcallCallingConv(LC);
9228   SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
9229 
9230   TargetLowering::CallLoweringInfo CLI(DAG);
9231   CLI.setDebugLoc(dl)
9232       .setChain(DAG.getEntryNode())
9233       .setCallee(CC, RetTy, Callee, std::move(Args))
9234       .setDiscardResult(ShouldUseSRet);
9235   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
9236 
9237   if (!ShouldUseSRet)
9238     return CallResult.first;
9239 
9240   SDValue LoadSin =
9241       DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
9242 
9243   // Address of cos field.
9244   SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
9245                             DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
9246   SDValue LoadCos =
9247       DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
9248 
9249   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
9250   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
9251                      LoadSin.getValue(0), LoadCos.getValue(0));
9252 }
9253 
9254 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
9255                                                   bool Signed,
9256                                                   SDValue &Chain) const {
9257   EVT VT = Op.getValueType();
9258   assert((VT == MVT::i32 || VT == MVT::i64) &&
9259          "unexpected type for custom lowering DIV");
9260   SDLoc dl(Op);
9261 
9262   const auto &DL = DAG.getDataLayout();
9263   const auto &TLI = DAG.getTargetLoweringInfo();
9264 
9265   const char *Name = nullptr;
9266   if (Signed)
9267     Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
9268   else
9269     Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
9270 
9271   SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
9272 
9273   ARMTargetLowering::ArgListTy Args;
9274 
9275   for (auto AI : {1, 0}) {
9276     ArgListEntry Arg;
9277     Arg.Node = Op.getOperand(AI);
9278     Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
9279     Args.push_back(Arg);
9280   }
9281 
9282   CallLoweringInfo CLI(DAG);
9283   CLI.setDebugLoc(dl)
9284     .setChain(Chain)
9285     .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
9286                ES, std::move(Args));
9287 
9288   return LowerCallTo(CLI).first;
9289 }
9290 
9291 // This is a code size optimisation: return the original SDIV node to
9292 // DAGCombiner when we don't want to expand SDIV into a sequence of
9293 // instructions, and an empty node otherwise which will cause the
9294 // SDIV to be expanded in DAGCombine.
9295 SDValue
9296 ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
9297                                  SelectionDAG &DAG,
9298                                  SmallVectorImpl<SDNode *> &Created) const {
9299   // TODO: Support SREM
9300   if (N->getOpcode() != ISD::SDIV)
9301     return SDValue();
9302 
9303   const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget());
9304   const bool MinSize = ST.hasMinSize();
9305   const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
9306                                       : ST.hasDivideInARMMode();
9307 
9308   // Don't touch vector types; rewriting this may lead to scalarizing
9309   // the int divs.
9310   if (N->getOperand(0).getValueType().isVector())
9311     return SDValue();
9312 
9313   // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
9314   // hwdiv support for this to be really profitable.
9315   if (!(MinSize && HasDivide))
9316     return SDValue();
9317 
9318   // ARM mode is a bit simpler than Thumb: we can handle large power
9319   // of 2 immediates with 1 mov instruction; no further checks required,
9320   // just return the sdiv node.
9321   if (!ST.isThumb())
9322     return SDValue(N, 0);
9323 
9324   // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
9325   // and thus lose the code size benefits of a MOVS that requires only 2.
9326   // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
9327   // but as it's doing exactly this, it's not worth the trouble to get TTI.
9328   if (Divisor.sgt(128))
9329     return SDValue();
9330 
9331   return SDValue(N, 0);
9332 }
9333 
9334 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9335                                             bool Signed) const {
9336   assert(Op.getValueType() == MVT::i32 &&
9337          "unexpected type for custom lowering DIV");
9338   SDLoc dl(Op);
9339 
9340   SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9341                                DAG.getEntryNode(), Op.getOperand(1));
9342 
9343   return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9344 }
9345 
9346 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
9347   SDLoc DL(N);
9348   SDValue Op = N->getOperand(1);
9349   if (N->getValueType(0) == MVT::i32)
9350     return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
9351   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
9352                            DAG.getConstant(0, DL, MVT::i32));
9353   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
9354                            DAG.getConstant(1, DL, MVT::i32));
9355   return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
9356                      DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
9357 }
9358 
9359 void ARMTargetLowering::ExpandDIV_Windows(
9360     SDValue Op, SelectionDAG &DAG, bool Signed,
9361     SmallVectorImpl<SDValue> &Results) const {
9362   const auto &DL = DAG.getDataLayout();
9363   const auto &TLI = DAG.getTargetLoweringInfo();
9364 
9365   assert(Op.getValueType() == MVT::i64 &&
9366          "unexpected type for custom lowering DIV");
9367   SDLoc dl(Op);
9368 
9369   SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
9370 
9371   SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9372 
9373   SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
9374   SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
9375                               DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
9376   Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
9377 
9378   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
9379 }
9380 
9381 static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
9382   LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
9383   EVT MemVT = LD->getMemoryVT();
9384   assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
9385          "Expected a predicate type!");
9386   assert(MemVT == Op.getValueType());
9387   assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
9388          "Expected a non-extending load");
9389   assert(LD->isUnindexed() && "Expected a unindexed load");
9390 
9391   // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit
9392   // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
9393   // need to make sure that 8/4 bits are actually loaded into the correct
9394   // place, which means loading the value and then shuffling the values into
9395   // the bottom bits of the predicate.
9396   // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
9397   // for BE).
9398 
9399   SDLoc dl(Op);
9400   SDValue Load = DAG.getExtLoad(
9401       ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
9402       EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
9403       LD->getMemOperand());
9404   SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load);
9405   if (MemVT != MVT::v16i1)
9406     Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
9407                        DAG.getConstant(0, dl, MVT::i32));
9408   return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
9409 }
9410 
9411 void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
9412                                   SelectionDAG &DAG) const {
9413   LoadSDNode *LD = cast<LoadSDNode>(N);
9414   EVT MemVT = LD->getMemoryVT();
9415   assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
9416 
9417   if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9418       !Subtarget->isThumb1Only() && LD->isVolatile()) {
9419     SDLoc dl(N);
9420     SDValue Result = DAG.getMemIntrinsicNode(
9421         ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
9422         {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
9423     SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
9424     SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
9425     SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
9426     Results.append({Pair, Result.getValue(2)});
9427   }
9428 }
9429 
9430 static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
9431   StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9432   EVT MemVT = ST->getMemoryVT();
9433   assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
9434          "Expected a predicate type!");
9435   assert(MemVT == ST->getValue().getValueType());
9436   assert(!ST->isTruncatingStore() && "Expected a non-extending store");
9437   assert(ST->isUnindexed() && "Expected a unindexed store");
9438 
9439   // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits
9440   // unset and a scalar store.
9441   SDLoc dl(Op);
9442   SDValue Build = ST->getValue();
9443   if (MemVT != MVT::v16i1) {
9444     SmallVector<SDValue, 16> Ops;
9445     for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++)
9446       Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
9447                                 DAG.getConstant(I, dl, MVT::i32)));
9448     for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
9449       Ops.push_back(DAG.getUNDEF(MVT::i32));
9450     Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
9451   }
9452   SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
9453   return DAG.getTruncStore(
9454       ST->getChain(), dl, GRP, ST->getBasePtr(),
9455       EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
9456       ST->getMemOperand());
9457 }
9458 
9459 static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
9460                           const ARMSubtarget *Subtarget) {
9461   StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9462   EVT MemVT = ST->getMemoryVT();
9463   assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
9464 
9465   if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9466       !Subtarget->isThumb1Only() && ST->isVolatile()) {
9467     SDNode *N = Op.getNode();
9468     SDLoc dl(N);
9469 
9470     SDValue Lo = DAG.getNode(
9471         ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
9472         DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
9473                               MVT::i32));
9474     SDValue Hi = DAG.getNode(
9475         ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
9476         DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
9477                               MVT::i32));
9478 
9479     return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
9480                                    {ST->getChain(), Lo, Hi, ST->getBasePtr()},
9481                                    MemVT, ST->getMemOperand());
9482   } else if (Subtarget->hasMVEIntegerOps() &&
9483              ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9484                MemVT == MVT::v16i1))) {
9485     return LowerPredicateStore(Op, DAG);
9486   }
9487 
9488   return SDValue();
9489 }
9490 
9491 static bool isZeroVector(SDValue N) {
9492   return (ISD::isBuildVectorAllZeros(N.getNode()) ||
9493           (N->getOpcode() == ARMISD::VMOVIMM &&
9494            isNullConstant(N->getOperand(0))));
9495 }
9496 
9497 static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
9498   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
9499   MVT VT = Op.getSimpleValueType();
9500   SDValue Mask = N->getMask();
9501   SDValue PassThru = N->getPassThru();
9502   SDLoc dl(Op);
9503 
9504   if (isZeroVector(PassThru))
9505     return Op;
9506 
9507   // MVE Masked loads use zero as the passthru value. Here we convert undef to
9508   // zero too, and other values are lowered to a select.
9509   SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
9510                                 DAG.getTargetConstant(0, dl, MVT::i32));
9511   SDValue NewLoad = DAG.getMaskedLoad(
9512       VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
9513       N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
9514       N->getExtensionType(), N->isExpandingLoad());
9515   SDValue Combo = NewLoad;
9516   bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
9517                              PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
9518                             isZeroVector(PassThru->getOperand(0));
9519   if (!PassThru.isUndef() && !PassThruIsCastZero)
9520     Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
9521   return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
9522 }
9523 
9524 static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
9525                               const ARMSubtarget *ST) {
9526   if (!ST->hasMVEIntegerOps())
9527     return SDValue();
9528 
9529   SDLoc dl(Op);
9530   unsigned BaseOpcode = 0;
9531   switch (Op->getOpcode()) {
9532   default: llvm_unreachable("Expected VECREDUCE opcode");
9533   case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
9534   case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
9535   case ISD::VECREDUCE_MUL:  BaseOpcode = ISD::MUL; break;
9536   case ISD::VECREDUCE_AND:  BaseOpcode = ISD::AND; break;
9537   case ISD::VECREDUCE_OR:   BaseOpcode = ISD::OR; break;
9538   case ISD::VECREDUCE_XOR:  BaseOpcode = ISD::XOR; break;
9539   case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
9540   case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
9541   }
9542 
9543   SDValue Op0 = Op->getOperand(0);
9544   EVT VT = Op0.getValueType();
9545   EVT EltVT = VT.getVectorElementType();
9546   unsigned NumElts = VT.getVectorNumElements();
9547   unsigned NumActiveLanes = NumElts;
9548 
9549   assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
9550           NumActiveLanes == 2) &&
9551          "Only expected a power 2 vector size");
9552 
9553   // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
9554   // allows us to easily extract vector elements from the lanes.
9555   while (NumActiveLanes > 4) {
9556     unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
9557     SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
9558     Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
9559     NumActiveLanes /= 2;
9560   }
9561 
9562   SDValue Res;
9563   if (NumActiveLanes == 4) {
9564     // The remaining 4 elements are summed sequentially
9565     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9566                               DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
9567     SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9568                               DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
9569     SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9570                               DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
9571     SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9572                               DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
9573     SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
9574     SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
9575     Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
9576   } else {
9577     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9578                               DAG.getConstant(0, dl, MVT::i32));
9579     SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9580                               DAG.getConstant(1, dl, MVT::i32));
9581     Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
9582   }
9583 
9584   // Result type may be wider than element type.
9585   if (EltVT != Op->getValueType(0))
9586     Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
9587   return Res;
9588 }
9589 
9590 static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
9591                                const ARMSubtarget *ST) {
9592   if (!ST->hasMVEFloatOps())
9593     return SDValue();
9594   return LowerVecReduce(Op, DAG, ST);
9595 }
9596 
9597 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
9598   if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
9599     // Acquire/Release load/store is not legal for targets without a dmb or
9600     // equivalent available.
9601     return SDValue();
9602 
9603   // Monotonic load/store is legal for all targets.
9604   return Op;
9605 }
9606 
9607 static void ReplaceREADCYCLECOUNTER(SDNode *N,
9608                                     SmallVectorImpl<SDValue> &Results,
9609                                     SelectionDAG &DAG,
9610                                     const ARMSubtarget *Subtarget) {
9611   SDLoc DL(N);
9612   // Under Power Management extensions, the cycle-count is:
9613   //    mrc p15, #0, <Rt>, c9, c13, #0
9614   SDValue Ops[] = { N->getOperand(0), // Chain
9615                     DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
9616                     DAG.getTargetConstant(15, DL, MVT::i32),
9617                     DAG.getTargetConstant(0, DL, MVT::i32),
9618                     DAG.getTargetConstant(9, DL, MVT::i32),
9619                     DAG.getTargetConstant(13, DL, MVT::i32),
9620                     DAG.getTargetConstant(0, DL, MVT::i32)
9621   };
9622 
9623   SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
9624                                  DAG.getVTList(MVT::i32, MVT::Other), Ops);
9625   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
9626                                 DAG.getConstant(0, DL, MVT::i32)));
9627   Results.push_back(Cycles32.getValue(1));
9628 }
9629 
9630 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
9631   SDLoc dl(V.getNode());
9632   SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
9633   SDValue VHi = DAG.getAnyExtOrTrunc(
9634       DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
9635       dl, MVT::i32);
9636   bool isBigEndian = DAG.getDataLayout().isBigEndian();
9637   if (isBigEndian)
9638     std::swap (VLo, VHi);
9639   SDValue RegClass =
9640       DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
9641   SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
9642   SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
9643   const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
9644   return SDValue(
9645       DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
9646 }
9647 
9648 static void ReplaceCMP_SWAP_64Results(SDNode *N,
9649                                        SmallVectorImpl<SDValue> & Results,
9650                                        SelectionDAG &DAG) {
9651   assert(N->getValueType(0) == MVT::i64 &&
9652          "AtomicCmpSwap on types less than 64 should be legal");
9653   SDValue Ops[] = {N->getOperand(1),
9654                    createGPRPairNode(DAG, N->getOperand(2)),
9655                    createGPRPairNode(DAG, N->getOperand(3)),
9656                    N->getOperand(0)};
9657   SDNode *CmpSwap = DAG.getMachineNode(
9658       ARM::CMP_SWAP_64, SDLoc(N),
9659       DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);
9660 
9661   MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
9662   DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
9663 
9664   bool isBigEndian = DAG.getDataLayout().isBigEndian();
9665 
9666   SDValue Lo =
9667       DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
9668                                  SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
9669   SDValue Hi =
9670       DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
9671                                  SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
9672   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
9673   Results.push_back(SDValue(CmpSwap, 2));
9674 }
9675 
9676 SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
9677   SDLoc dl(Op);
9678   EVT VT = Op.getValueType();
9679   SDValue Chain = Op.getOperand(0);
9680   SDValue LHS = Op.getOperand(1);
9681   SDValue RHS = Op.getOperand(2);
9682   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
9683   bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
9684 
9685   // If we don't have instructions of this float type then soften to a libcall
9686   // and use SETCC instead.
9687   if (isUnsupportedFloatingType(LHS.getValueType())) {
9688     DAG.getTargetLoweringInfo().softenSetCCOperands(
9689       DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling);
9690     if (!RHS.getNode()) {
9691       RHS = DAG.getConstant(0, dl, LHS.getValueType());
9692       CC = ISD::SETNE;
9693     }
9694     SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
9695                                  DAG.getCondCode(CC));
9696     return DAG.getMergeValues({Result, Chain}, dl);
9697   }
9698 
9699   ARMCC::CondCodes CondCode, CondCode2;
9700   FPCCToARMCC(CC, CondCode, CondCode2);
9701 
9702   // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit
9703   // in CMPFP and CMPFPE, but instead it should be made explicit by these
9704   // instructions using a chain instead of glue. This would also fix the problem
9705   // here (and also in LowerSELECT_CC) where we generate two comparisons when
9706   // CondCode2 != AL.
9707   SDValue True = DAG.getConstant(1, dl, VT);
9708   SDValue False =  DAG.getConstant(0, dl, VT);
9709   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
9710   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
9711   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
9712   SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG);
9713   if (CondCode2 != ARMCC::AL) {
9714     ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
9715     Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
9716     Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG);
9717   }
9718   return DAG.getMergeValues({Result, Chain}, dl);
9719 }
9720 
9721 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
9722   LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
9723   switch (Op.getOpcode()) {
9724   default: llvm_unreachable("Don't know how to custom lower this!");
9725   case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
9726   case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
9727   case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
9728   case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
9729   case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
9730   case ISD::SELECT:        return LowerSELECT(Op, DAG);
9731   case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
9732   case ISD::BRCOND:        return LowerBRCOND(Op, DAG);
9733   case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
9734   case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
9735   case ISD::VASTART:       return LowerVASTART(Op, DAG);
9736   case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
9737   case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
9738   case ISD::SINT_TO_FP:
9739   case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
9740   case ISD::STRICT_FP_TO_SINT:
9741   case ISD::STRICT_FP_TO_UINT:
9742   case ISD::FP_TO_SINT:
9743   case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
9744   case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
9745   case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
9746   case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
9747   case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
9748   case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
9749   case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
9750   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
9751   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
9752                                                                Subtarget);
9753   case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
9754   case ISD::SHL:
9755   case ISD::SRL:
9756   case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
9757   case ISD::SREM:          return LowerREM(Op.getNode(), DAG);
9758   case ISD::UREM:          return LowerREM(Op.getNode(), DAG);
9759   case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
9760   case ISD::SRL_PARTS:
9761   case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
9762   case ISD::CTTZ:
9763   case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
9764   case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
9765   case ISD::SETCC:         return LowerVSETCC(Op, DAG, Subtarget);
9766   case ISD::SETCCCARRY:    return LowerSETCCCARRY(Op, DAG);
9767   case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
9768   case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
9769   case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
9770   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
9771   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
9772   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
9773   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
9774   case ISD::TRUNCATE:      return LowerTruncatei1(Op, DAG, Subtarget);
9775   case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
9776   case ISD::MUL:           return LowerMUL(Op, DAG);
9777   case ISD::SDIV:
9778     if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
9779       return LowerDIV_Windows(Op, DAG, /* Signed */ true);
9780     return LowerSDIV(Op, DAG, Subtarget);
9781   case ISD::UDIV:
9782     if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
9783       return LowerDIV_Windows(Op, DAG, /* Signed */ false);
9784     return LowerUDIV(Op, DAG, Subtarget);
9785   case ISD::ADDCARRY:
9786   case ISD::SUBCARRY:      return LowerADDSUBCARRY(Op, DAG);
9787   case ISD::SADDO:
9788   case ISD::SSUBO:
9789     return LowerSignedALUO(Op, DAG);
9790   case ISD::UADDO:
9791   case ISD::USUBO:
9792     return LowerUnsignedALUO(Op, DAG);
9793   case ISD::SADDSAT:
9794   case ISD::SSUBSAT:
9795     return LowerSADDSUBSAT(Op, DAG, Subtarget);
9796   case ISD::LOAD:
9797     return LowerPredicateLoad(Op, DAG);
9798   case ISD::STORE:
9799     return LowerSTORE(Op, DAG, Subtarget);
9800   case ISD::MLOAD:
9801     return LowerMLOAD(Op, DAG);
9802   case ISD::VECREDUCE_MUL:
9803   case ISD::VECREDUCE_AND:
9804   case ISD::VECREDUCE_OR:
9805   case ISD::VECREDUCE_XOR:
9806     return LowerVecReduce(Op, DAG, Subtarget);
9807   case ISD::VECREDUCE_FADD:
9808   case ISD::VECREDUCE_FMUL:
9809   case ISD::VECREDUCE_FMIN:
9810   case ISD::VECREDUCE_FMAX:
9811     return LowerVecReduceF(Op, DAG, Subtarget);
9812   case ISD::ATOMIC_LOAD:
9813   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
9814   case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
9815   case ISD::SDIVREM:
9816   case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
9817   case ISD::DYNAMIC_STACKALLOC:
9818     if (Subtarget->isTargetWindows())
9819       return LowerDYNAMIC_STACKALLOC(Op, DAG);
9820     llvm_unreachable("Don't know how to custom lower this!");
9821   case ISD::STRICT_FP_ROUND:
9822   case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
9823   case ISD::STRICT_FP_EXTEND:
9824   case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
9825   case ISD::STRICT_FSETCC:
9826   case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
9827   case ARMISD::WIN__DBZCHK: return SDValue();
9828   }
9829 }
9830 
9831 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
9832                                  SelectionDAG &DAG) {
9833   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
9834   unsigned Opc = 0;
9835   if (IntNo == Intrinsic::arm_smlald)
9836     Opc = ARMISD::SMLALD;
9837   else if (IntNo == Intrinsic::arm_smlaldx)
9838     Opc = ARMISD::SMLALDX;
9839   else if (IntNo == Intrinsic::arm_smlsld)
9840     Opc = ARMISD::SMLSLD;
9841   else if (IntNo == Intrinsic::arm_smlsldx)
9842     Opc = ARMISD::SMLSLDX;
9843   else
9844     return;
9845 
9846   SDLoc dl(N);
9847   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
9848                            N->getOperand(3),
9849                            DAG.getConstant(0, dl, MVT::i32));
9850   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
9851                            N->getOperand(3),
9852                            DAG.getConstant(1, dl, MVT::i32));
9853 
9854   SDValue LongMul = DAG.getNode(Opc, dl,
9855                                 DAG.getVTList(MVT::i32, MVT::i32),
9856                                 N->getOperand(1), N->getOperand(2),
9857                                 Lo, Hi);
9858   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
9859                                 LongMul.getValue(0), LongMul.getValue(1)));
9860 }
9861 
9862 /// ReplaceNodeResults - Replace the results of node with an illegal result
9863 /// type with new values built out of custom code.
9864 void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
9865                                            SmallVectorImpl<SDValue> &Results,
9866                                            SelectionDAG &DAG) const {
9867   SDValue Res;
9868   switch (N->getOpcode()) {
9869   default:
9870     llvm_unreachable("Don't know how to custom expand this!");
9871   case ISD::READ_REGISTER:
9872     ExpandREAD_REGISTER(N, Results, DAG);
9873     break;
9874   case ISD::BITCAST:
9875     Res = ExpandBITCAST(N, DAG, Subtarget);
9876     break;
9877   case ISD::SRL:
9878   case ISD::SRA:
9879   case ISD::SHL:
9880     Res = Expand64BitShift(N, DAG, Subtarget);
9881     break;
9882   case ISD::SREM:
9883   case ISD::UREM:
9884     Res = LowerREM(N, DAG);
9885     break;
9886   case ISD::SDIVREM:
9887   case ISD::UDIVREM:
9888     Res = LowerDivRem(SDValue(N, 0), DAG);
9889     assert(Res.getNumOperands() == 2 && "DivRem needs two values");
9890     Results.push_back(Res.getValue(0));
9891     Results.push_back(Res.getValue(1));
9892     return;
9893   case ISD::SADDSAT:
9894   case ISD::SSUBSAT:
9895     Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
9896     break;
9897   case ISD::READCYCLECOUNTER:
9898     ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
9899     return;
9900   case ISD::UDIV:
9901   case ISD::SDIV:
9902     assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
9903     return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
9904                              Results);
9905   case ISD::ATOMIC_CMP_SWAP:
9906     ReplaceCMP_SWAP_64Results(N, Results, DAG);
9907     return;
9908   case ISD::INTRINSIC_WO_CHAIN:
9909     return ReplaceLongIntrinsic(N, Results, DAG);
9910   case ISD::ABS:
9911      lowerABS(N, Results, DAG);
9912      return ;
9913   case ISD::LOAD:
9914     LowerLOAD(N, Results, DAG);
9915     break;
9916   }
9917   if (Res.getNode())
9918     Results.push_back(Res);
9919 }
9920 
9921 //===----------------------------------------------------------------------===//
9922 //                           ARM Scheduler Hooks
9923 //===----------------------------------------------------------------------===//
9924 
9925 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
9926 /// registers the function context.
9927 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
9928                                                MachineBasicBlock *MBB,
9929                                                MachineBasicBlock *DispatchBB,
9930                                                int FI) const {
9931   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
9932          "ROPI/RWPI not currently supported with SjLj");
9933   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
9934   DebugLoc dl = MI.getDebugLoc();
9935   MachineFunction *MF = MBB->getParent();
9936   MachineRegisterInfo *MRI = &MF->getRegInfo();
9937   MachineConstantPool *MCP = MF->getConstantPool();
9938   ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
9939   const Function &F = MF->getFunction();
9940 
9941   bool isThumb = Subtarget->isThumb();
9942   bool isThumb2 = Subtarget->isThumb2();
9943 
9944   unsigned PCLabelId = AFI->createPICLabelUId();
9945   unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
9946   ARMConstantPoolValue *CPV =
9947     ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
9948   unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
9949 
9950   const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
9951                                            : &ARM::GPRRegClass;
9952 
9953   // Grab constant pool and fixed stack memory operands.
9954   MachineMemOperand *CPMMO =
9955       MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
9956                                MachineMemOperand::MOLoad, 4, Align(4));
9957 
9958   MachineMemOperand *FIMMOSt =
9959       MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
9960                                MachineMemOperand::MOStore, 4, Align(4));
9961 
9962   // Load the address of the dispatch MBB into the jump buffer.
9963   if (isThumb2) {
9964     // Incoming value: jbuf
9965     //   ldr.n  r5, LCPI1_1
9966     //   orr    r5, r5, #1
9967     //   add    r5, pc
9968     //   str    r5, [$jbuf, #+4] ; &jbuf[1]
9969     Register NewVReg1 = MRI->createVirtualRegister(TRC);
9970     BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
9971         .addConstantPoolIndex(CPI)
9972         .addMemOperand(CPMMO)
9973         .add(predOps(ARMCC::AL));
9974     // Set the low bit because of thumb mode.
9975     Register NewVReg2 = MRI->createVirtualRegister(TRC);
9976     BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
9977         .addReg(NewVReg1, RegState::Kill)
9978         .addImm(0x01)
9979         .add(predOps(ARMCC::AL))
9980         .add(condCodeOp());
9981     Register NewVReg3 = MRI->createVirtualRegister(TRC);
9982     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
9983       .addReg(NewVReg2, RegState::Kill)
9984       .addImm(PCLabelId);
9985     BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
9986         .addReg(NewVReg3, RegState::Kill)
9987         .addFrameIndex(FI)
9988         .addImm(36) // &jbuf[1] :: pc
9989         .addMemOperand(FIMMOSt)
9990         .add(predOps(ARMCC::AL));
9991   } else if (isThumb) {
9992     // Incoming value: jbuf
9993     //   ldr.n  r1, LCPI1_4
9994     //   add    r1, pc
9995     //   mov    r2, #1
9996     //   orrs   r1, r2
9997     //   add    r2, $jbuf, #+4 ; &jbuf[1]
9998     //   str    r1, [r2]
9999     Register NewVReg1 = MRI->createVirtualRegister(TRC);
10000     BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10001         .addConstantPoolIndex(CPI)
10002         .addMemOperand(CPMMO)
10003         .add(predOps(ARMCC::AL));
10004     Register NewVReg2 = MRI->createVirtualRegister(TRC);
10005     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10006       .addReg(NewVReg1, RegState::Kill)
10007       .addImm(PCLabelId);
10008     // Set the low bit because of thumb mode.
10009     Register NewVReg3 = MRI->createVirtualRegister(TRC);
10010     BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10011         .addReg(ARM::CPSR, RegState::Define)
10012         .addImm(1)
10013         .add(predOps(ARMCC::AL));
10014     Register NewVReg4 = MRI->createVirtualRegister(TRC);
10015     BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10016         .addReg(ARM::CPSR, RegState::Define)
10017         .addReg(NewVReg2, RegState::Kill)
10018         .addReg(NewVReg3, RegState::Kill)
10019         .add(predOps(ARMCC::AL));
10020     Register NewVReg5 = MRI->createVirtualRegister(TRC);
10021     BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10022             .addFrameIndex(FI)
10023             .addImm(36); // &jbuf[1] :: pc
10024     BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10025         .addReg(NewVReg4, RegState::Kill)
10026         .addReg(NewVReg5, RegState::Kill)
10027         .addImm(0)
10028         .addMemOperand(FIMMOSt)
10029         .add(predOps(ARMCC::AL));
10030   } else {
10031     // Incoming value: jbuf
10032     //   ldr  r1, LCPI1_1
10033     //   add  r1, pc, r1
10034     //   str  r1, [$jbuf, #+4] ; &jbuf[1]
10035     Register NewVReg1 = MRI->createVirtualRegister(TRC);
10036     BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10037         .addConstantPoolIndex(CPI)
10038         .addImm(0)
10039         .addMemOperand(CPMMO)
10040         .add(predOps(ARMCC::AL));
10041     Register NewVReg2 = MRI->createVirtualRegister(TRC);
10042     BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10043         .addReg(NewVReg1, RegState::Kill)
10044         .addImm(PCLabelId)
10045         .add(predOps(ARMCC::AL));
10046     BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10047         .addReg(NewVReg2, RegState::Kill)
10048         .addFrameIndex(FI)
10049         .addImm(36) // &jbuf[1] :: pc
10050         .addMemOperand(FIMMOSt)
10051         .add(predOps(ARMCC::AL));
10052   }
10053 }
10054 
10055 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10056                                               MachineBasicBlock *MBB) const {
10057   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10058   DebugLoc dl = MI.getDebugLoc();
10059   MachineFunction *MF = MBB->getParent();
10060   MachineRegisterInfo *MRI = &MF->getRegInfo();
10061   MachineFrameInfo &MFI = MF->getFrameInfo();
10062   int FI = MFI.getFunctionContextIndex();
10063 
10064   const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10065                                                         : &ARM::GPRnopcRegClass;
10066 
10067   // Get a mapping of the call site numbers to all of the landing pads they're
10068   // associated with.
10069   DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
10070   unsigned MaxCSNum = 0;
10071   for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
10072        ++BB) {
10073     if (!BB->isEHPad()) continue;
10074 
10075     // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10076     // pad.
10077     for (MachineBasicBlock::iterator
10078            II = BB->begin(), IE = BB->end(); II != IE; ++II) {
10079       if (!II->isEHLabel()) continue;
10080 
10081       MCSymbol *Sym = II->getOperand(0).getMCSymbol();
10082       if (!MF->hasCallSiteLandingPad(Sym)) continue;
10083 
10084       SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10085       for (SmallVectorImpl<unsigned>::iterator
10086              CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
10087            CSI != CSE; ++CSI) {
10088         CallSiteNumToLPad[*CSI].push_back(&*BB);
10089         MaxCSNum = std::max(MaxCSNum, *CSI);
10090       }
10091       break;
10092     }
10093   }
10094 
10095   // Get an ordered list of the machine basic blocks for the jump table.
10096   std::vector<MachineBasicBlock*> LPadList;
10097   SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
10098   LPadList.reserve(CallSiteNumToLPad.size());
10099   for (unsigned I = 1; I <= MaxCSNum; ++I) {
10100     SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
10101     for (SmallVectorImpl<MachineBasicBlock*>::iterator
10102            II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
10103       LPadList.push_back(*II);
10104       InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
10105     }
10106   }
10107 
10108   assert(!LPadList.empty() &&
10109          "No landing pad destinations for the dispatch jump table!");
10110 
10111   // Create the jump table and associated information.
10112   MachineJumpTableInfo *JTI =
10113     MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
10114   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
10115 
10116   // Create the MBBs for the dispatch code.
10117 
10118   // Shove the dispatch's address into the return slot in the function context.
10119   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
10120   DispatchBB->setIsEHPad();
10121 
10122   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
10123   unsigned trap_opcode;
10124   if (Subtarget->isThumb())
10125     trap_opcode = ARM::tTRAP;
10126   else
10127     trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
10128 
10129   BuildMI(TrapBB, dl, TII->get(trap_opcode));
10130   DispatchBB->addSuccessor(TrapBB);
10131 
10132   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
10133   DispatchBB->addSuccessor(DispContBB);
10134 
10135   // Insert and MBBs.
10136   MF->insert(MF->end(), DispatchBB);
10137   MF->insert(MF->end(), DispContBB);
10138   MF->insert(MF->end(), TrapBB);
10139 
10140   // Insert code into the entry block that creates and registers the function
10141   // context.
10142   SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
10143 
10144   MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
10145       MachinePointerInfo::getFixedStack(*MF, FI),
10146       MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4));
10147 
10148   MachineInstrBuilder MIB;
10149   MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
10150 
10151   const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
10152   const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
10153 
10154   // Add a register mask with no preserved registers.  This results in all
10155   // registers being marked as clobbered. This can't work if the dispatch block
10156   // is in a Thumb1 function and is linked with ARM code which uses the FP
10157   // registers, as there is no way to preserve the FP registers in Thumb1 mode.
10158   MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));
10159 
10160   bool IsPositionIndependent = isPositionIndependent();
10161   unsigned NumLPads = LPadList.size();
10162   if (Subtarget->isThumb2()) {
10163     Register NewVReg1 = MRI->createVirtualRegister(TRC);
10164     BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
10165         .addFrameIndex(FI)
10166         .addImm(4)
10167         .addMemOperand(FIMMOLd)
10168         .add(predOps(ARMCC::AL));
10169 
10170     if (NumLPads < 256) {
10171       BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
10172           .addReg(NewVReg1)
10173           .addImm(LPadList.size())
10174           .add(predOps(ARMCC::AL));
10175     } else {
10176       Register VReg1 = MRI->createVirtualRegister(TRC);
10177       BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
10178           .addImm(NumLPads & 0xFFFF)
10179           .add(predOps(ARMCC::AL));
10180 
10181       unsigned VReg2 = VReg1;
10182       if ((NumLPads & 0xFFFF0000) != 0) {
10183         VReg2 = MRI->createVirtualRegister(TRC);
10184         BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
10185             .addReg(VReg1)
10186             .addImm(NumLPads >> 16)
10187             .add(predOps(ARMCC::AL));
10188       }
10189 
10190       BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
10191           .addReg(NewVReg1)
10192           .addReg(VReg2)
10193           .add(predOps(ARMCC::AL));
10194     }
10195 
10196     BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
10197       .addMBB(TrapBB)
10198       .addImm(ARMCC::HI)
10199       .addReg(ARM::CPSR);
10200 
10201     Register NewVReg3 = MRI->createVirtualRegister(TRC);
10202     BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
10203         .addJumpTableIndex(MJTI)
10204         .add(predOps(ARMCC::AL));
10205 
10206     Register NewVReg4 = MRI->createVirtualRegister(TRC);
10207     BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
10208         .addReg(NewVReg3, RegState::Kill)
10209         .addReg(NewVReg1)
10210         .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
10211         .add(predOps(ARMCC::AL))
10212         .add(condCodeOp());
10213 
10214     BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
10215       .addReg(NewVReg4, RegState::Kill)
10216       .addReg(NewVReg1)
10217       .addJumpTableIndex(MJTI);
10218   } else if (Subtarget->isThumb()) {
10219     Register NewVReg1 = MRI->createVirtualRegister(TRC);
10220     BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
10221         .addFrameIndex(FI)
10222         .addImm(1)
10223         .addMemOperand(FIMMOLd)
10224         .add(predOps(ARMCC::AL));
10225 
10226     if (NumLPads < 256) {
10227       BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
10228           .addReg(NewVReg1)
10229           .addImm(NumLPads)
10230           .add(predOps(ARMCC::AL));
10231     } else {
10232       MachineConstantPool *ConstantPool = MF->getConstantPool();
10233       Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
10234       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
10235 
10236       // MachineConstantPool wants an explicit alignment.
10237       Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
10238       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
10239 
10240       Register VReg1 = MRI->createVirtualRegister(TRC);
10241       BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
10242           .addReg(VReg1, RegState::Define)
10243           .addConstantPoolIndex(Idx)
10244           .add(predOps(ARMCC::AL));
10245       BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
10246           .addReg(NewVReg1)
10247           .addReg(VReg1)
10248           .add(predOps(ARMCC::AL));
10249     }
10250 
10251     BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
10252       .addMBB(TrapBB)
10253       .addImm(ARMCC::HI)
10254       .addReg(ARM::CPSR);
10255 
10256     Register NewVReg2 = MRI->createVirtualRegister(TRC);
10257     BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
10258         .addReg(ARM::CPSR, RegState::Define)
10259         .addReg(NewVReg1)
10260         .addImm(2)
10261         .add(predOps(ARMCC::AL));
10262 
10263     Register NewVReg3 = MRI->createVirtualRegister(TRC);
10264     BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
10265         .addJumpTableIndex(MJTI)
10266         .add(predOps(ARMCC::AL));
10267 
10268     Register NewVReg4 = MRI->createVirtualRegister(TRC);
10269     BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
10270         .addReg(ARM::CPSR, RegState::Define)
10271         .addReg(NewVReg2, RegState::Kill)
10272         .addReg(NewVReg3)
10273         .add(predOps(ARMCC::AL));
10274 
10275     MachineMemOperand *JTMMOLd =
10276         MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
10277                                  MachineMemOperand::MOLoad, 4, Align(4));
10278 
10279     Register NewVReg5 = MRI->createVirtualRegister(TRC);
10280     BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
10281         .addReg(NewVReg4, RegState::Kill)
10282         .addImm(0)
10283         .addMemOperand(JTMMOLd)
10284         .add(predOps(ARMCC::AL));
10285 
10286     unsigned NewVReg6 = NewVReg5;
10287     if (IsPositionIndependent) {
10288       NewVReg6 = MRI->createVirtualRegister(TRC);
10289       BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
10290           .addReg(ARM::CPSR, RegState::Define)
10291           .addReg(NewVReg5, RegState::Kill)
10292           .addReg(NewVReg3)
10293           .add(predOps(ARMCC::AL));
10294     }
10295 
10296     BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
10297       .addReg(NewVReg6, RegState::Kill)
10298       .addJumpTableIndex(MJTI);
10299   } else {
10300     Register NewVReg1 = MRI->createVirtualRegister(TRC);
10301     BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
10302         .addFrameIndex(FI)
10303         .addImm(4)
10304         .addMemOperand(FIMMOLd)
10305         .add(predOps(ARMCC::AL));
10306 
10307     if (NumLPads < 256) {
10308       BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
10309           .addReg(NewVReg1)
10310           .addImm(NumLPads)
10311           .add(predOps(ARMCC::AL));
10312     } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
10313       Register VReg1 = MRI->createVirtualRegister(TRC);
10314       BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
10315           .addImm(NumLPads & 0xFFFF)
10316           .add(predOps(ARMCC::AL));
10317 
10318       unsigned VReg2 = VReg1;
10319       if ((NumLPads & 0xFFFF0000) != 0) {
10320         VReg2 = MRI->createVirtualRegister(TRC);
10321         BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
10322             .addReg(VReg1)
10323             .addImm(NumLPads >> 16)
10324             .add(predOps(ARMCC::AL));
10325       }
10326 
10327       BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
10328           .addReg(NewVReg1)
10329           .addReg(VReg2)
10330           .add(predOps(ARMCC::AL));
10331     } else {
10332       MachineConstantPool *ConstantPool = MF->getConstantPool();
10333       Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
10334       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
10335 
10336       // MachineConstantPool wants an explicit alignment.
10337       Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
10338       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
10339 
10340       Register VReg1 = MRI->createVirtualRegister(TRC);
10341       BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
10342           .addReg(VReg1, RegState::Define)
10343           .addConstantPoolIndex(Idx)
10344           .addImm(0)
10345           .add(predOps(ARMCC::AL));
10346       BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
10347           .addReg(NewVReg1)
10348           .addReg(VReg1, RegState::Kill)
10349           .add(predOps(ARMCC::AL));
10350     }
10351 
10352     BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
10353       .addMBB(TrapBB)
10354       .addImm(ARMCC::HI)
10355       .addReg(ARM::CPSR);
10356 
10357     Register NewVReg3 = MRI->createVirtualRegister(TRC);
10358     BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
10359         .addReg(NewVReg1)
10360         .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
10361         .add(predOps(ARMCC::AL))
10362         .add(condCodeOp());
10363     Register NewVReg4 = MRI->createVirtualRegister(TRC);
10364     BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
10365         .addJumpTableIndex(MJTI)
10366         .add(predOps(ARMCC::AL));
10367 
10368     MachineMemOperand *JTMMOLd =
10369         MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
10370                                  MachineMemOperand::MOLoad, 4, Align(4));
10371     Register NewVReg5 = MRI->createVirtualRegister(TRC);
10372     BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
10373         .addReg(NewVReg3, RegState::Kill)
10374         .addReg(NewVReg4)
10375         .addImm(0)
10376         .addMemOperand(JTMMOLd)
10377         .add(predOps(ARMCC::AL));
10378 
10379     if (IsPositionIndependent) {
10380       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
10381         .addReg(NewVReg5, RegState::Kill)
10382         .addReg(NewVReg4)
10383         .addJumpTableIndex(MJTI);
10384     } else {
10385       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
10386         .addReg(NewVReg5, RegState::Kill)
10387         .addJumpTableIndex(MJTI);
10388     }
10389   }
10390 
10391   // Add the jump table entries as successors to the MBB.
10392   SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
10393   for (std::vector<MachineBasicBlock*>::iterator
10394          I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
10395     MachineBasicBlock *CurMBB = *I;
10396     if (SeenMBBs.insert(CurMBB).second)
10397       DispContBB->addSuccessor(CurMBB);
10398   }
10399 
10400   // N.B. the order the invoke BBs are processed in doesn't matter here.
10401   const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
10402   SmallVector<MachineBasicBlock*, 64> MBBLPads;
10403   for (MachineBasicBlock *BB : InvokeBBs) {
10404 
10405     // Remove the landing pad successor from the invoke block and replace it
10406     // with the new dispatch block.
10407     SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
10408     while (!Successors.empty()) {
10409       MachineBasicBlock *SMBB = Successors.pop_back_val();
10410       if (SMBB->isEHPad()) {
10411         BB->removeSuccessor(SMBB);
10412         MBBLPads.push_back(SMBB);
10413       }
10414     }
10415 
10416     BB->addSuccessor(DispatchBB, BranchProbability::getZero());
10417     BB->normalizeSuccProbs();
10418 
10419     // Find the invoke call and mark all of the callee-saved registers as
10420     // 'implicit defined' so that they're spilled. This prevents code from
10421     // moving instructions to before the EH block, where they will never be
10422     // executed.
10423     for (MachineBasicBlock::reverse_iterator
10424            II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
10425       if (!II->isCall()) continue;
10426 
10427       DenseMap<unsigned, bool> DefRegs;
10428       for (MachineInstr::mop_iterator
10429              OI = II->operands_begin(), OE = II->operands_end();
10430            OI != OE; ++OI) {
10431         if (!OI->isReg()) continue;
10432         DefRegs[OI->getReg()] = true;
10433       }
10434 
10435       MachineInstrBuilder MIB(*MF, &*II);
10436 
10437       for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
10438         unsigned Reg = SavedRegs[i];
10439         if (Subtarget->isThumb2() &&
10440             !ARM::tGPRRegClass.contains(Reg) &&
10441             !ARM::hGPRRegClass.contains(Reg))
10442           continue;
10443         if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
10444           continue;
10445         if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
10446           continue;
10447         if (!DefRegs[Reg])
10448           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
10449       }
10450 
10451       break;
10452     }
10453   }
10454 
10455   // Mark all former landing pads as non-landing pads. The dispatch is the only
10456   // landing pad now.
10457   for (SmallVectorImpl<MachineBasicBlock*>::iterator
10458          I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
10459     (*I)->setIsEHPad(false);
10460 
10461   // The instruction is gone now.
10462   MI.eraseFromParent();
10463 }
10464 
10465 static
10466 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
10467   for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
10468        E = MBB->succ_end(); I != E; ++I)
10469     if (*I != Succ)
10470       return *I;
10471   llvm_unreachable("Expecting a BB with two successors!");
10472 }
10473 
10474 /// Return the load opcode for a given load size. If load size >= 8,
10475 /// neon opcode will be returned.
10476 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
10477   if (LdSize >= 8)
10478     return LdSize == 16 ? ARM::VLD1q32wb_fixed
10479                         : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
10480   if (IsThumb1)
10481     return LdSize == 4 ? ARM::tLDRi
10482                        : LdSize == 2 ? ARM::tLDRHi
10483                                      : LdSize == 1 ? ARM::tLDRBi : 0;
10484   if (IsThumb2)
10485     return LdSize == 4 ? ARM::t2LDR_POST
10486                        : LdSize == 2 ? ARM::t2LDRH_POST
10487                                      : LdSize == 1 ? ARM::t2LDRB_POST : 0;
10488   return LdSize == 4 ? ARM::LDR_POST_IMM
10489                      : LdSize == 2 ? ARM::LDRH_POST
10490                                    : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
10491 }
10492 
10493 /// Return the store opcode for a given store size. If store size >= 8,
10494 /// neon opcode will be returned.
10495 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
10496   if (StSize >= 8)
10497     return StSize == 16 ? ARM::VST1q32wb_fixed
10498                         : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
10499   if (IsThumb1)
10500     return StSize == 4 ? ARM::tSTRi
10501                        : StSize == 2 ? ARM::tSTRHi
10502                                      : StSize == 1 ? ARM::tSTRBi : 0;
10503   if (IsThumb2)
10504     return StSize == 4 ? ARM::t2STR_POST
10505                        : StSize == 2 ? ARM::t2STRH_POST
10506                                      : StSize == 1 ? ARM::t2STRB_POST : 0;
10507   return StSize == 4 ? ARM::STR_POST_IMM
10508                      : StSize == 2 ? ARM::STRH_POST
10509                                    : StSize == 1 ? ARM::STRB_POST_IMM : 0;
10510 }
10511 
10512 /// Emit a post-increment load operation with given size. The instructions
10513 /// will be added to BB at Pos.
10514 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
10515                        const TargetInstrInfo *TII, const DebugLoc &dl,
10516                        unsigned LdSize, unsigned Data, unsigned AddrIn,
10517                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
10518   unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
10519   assert(LdOpc != 0 && "Should have a load opcode");
10520   if (LdSize >= 8) {
10521     BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
10522         .addReg(AddrOut, RegState::Define)
10523         .addReg(AddrIn)
10524         .addImm(0)
10525         .add(predOps(ARMCC::AL));
10526   } else if (IsThumb1) {
10527     // load + update AddrIn
10528     BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
10529         .addReg(AddrIn)
10530         .addImm(0)
10531         .add(predOps(ARMCC::AL));
10532     BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
10533         .add(t1CondCodeOp())
10534         .addReg(AddrIn)
10535         .addImm(LdSize)
10536         .add(predOps(ARMCC::AL));
10537   } else if (IsThumb2) {
10538     BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
10539         .addReg(AddrOut, RegState::Define)
10540         .addReg(AddrIn)
10541         .addImm(LdSize)
10542         .add(predOps(ARMCC::AL));
10543   } else { // arm
10544     BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
10545         .addReg(AddrOut, RegState::Define)
10546         .addReg(AddrIn)
10547         .addReg(0)
10548         .addImm(LdSize)
10549         .add(predOps(ARMCC::AL));
10550   }
10551 }
10552 
10553 /// Emit a post-increment store operation with given size. The instructions
10554 /// will be added to BB at Pos.
10555 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
10556                        const TargetInstrInfo *TII, const DebugLoc &dl,
10557                        unsigned StSize, unsigned Data, unsigned AddrIn,
10558                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
10559   unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
10560   assert(StOpc != 0 && "Should have a store opcode");
10561   if (StSize >= 8) {
10562     BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
10563         .addReg(AddrIn)
10564         .addImm(0)
10565         .addReg(Data)
10566         .add(predOps(ARMCC::AL));
10567   } else if (IsThumb1) {
10568     // store + update AddrIn
10569     BuildMI(*BB, Pos, dl, TII->get(StOpc))
10570         .addReg(Data)
10571         .addReg(AddrIn)
10572         .addImm(0)
10573         .add(predOps(ARMCC::AL));
10574     BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
10575         .add(t1CondCodeOp())
10576         .addReg(AddrIn)
10577         .addImm(StSize)
10578         .add(predOps(ARMCC::AL));
10579   } else if (IsThumb2) {
10580     BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
10581         .addReg(Data)
10582         .addReg(AddrIn)
10583         .addImm(StSize)
10584         .add(predOps(ARMCC::AL));
10585   } else { // arm
10586     BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
10587         .addReg(Data)
10588         .addReg(AddrIn)
10589         .addReg(0)
10590         .addImm(StSize)
10591         .add(predOps(ARMCC::AL));
10592   }
10593 }
10594 
10595 MachineBasicBlock *
10596 ARMTargetLowering::EmitStructByval(MachineInstr &MI,
10597                                    MachineBasicBlock *BB) const {
10598   // This pseudo instruction has 3 operands: dst, src, size
10599   // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
10600   // Otherwise, we will generate unrolled scalar copies.
10601   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10602   const BasicBlock *LLVM_BB = BB->getBasicBlock();
10603   MachineFunction::iterator It = ++BB->getIterator();
10604 
10605   Register dest = MI.getOperand(0).getReg();
10606   Register src = MI.getOperand(1).getReg();
10607   unsigned SizeVal = MI.getOperand(2).getImm();
10608   unsigned Alignment = MI.getOperand(3).getImm();
10609   DebugLoc dl = MI.getDebugLoc();
10610 
10611   MachineFunction *MF = BB->getParent();
10612   MachineRegisterInfo &MRI = MF->getRegInfo();
10613   unsigned UnitSize = 0;
10614   const TargetRegisterClass *TRC = nullptr;
10615   const TargetRegisterClass *VecTRC = nullptr;
10616 
10617   bool IsThumb1 = Subtarget->isThumb1Only();
10618   bool IsThumb2 = Subtarget->isThumb2();
10619   bool IsThumb = Subtarget->isThumb();
10620 
10621   if (Alignment & 1) {
10622     UnitSize = 1;
10623   } else if (Alignment & 2) {
10624     UnitSize = 2;
10625   } else {
10626     // Check whether we can use NEON instructions.
10627     if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
10628         Subtarget->hasNEON()) {
10629       if ((Alignment % 16 == 0) && SizeVal >= 16)
10630         UnitSize = 16;
10631       else if ((Alignment % 8 == 0) && SizeVal >= 8)
10632         UnitSize = 8;
10633     }
10634     // Can't use NEON instructions.
10635     if (UnitSize == 0)
10636       UnitSize = 4;
10637   }
10638 
10639   // Select the correct opcode and register class for unit size load/store
10640   bool IsNeon = UnitSize >= 8;
10641   TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
10642   if (IsNeon)
10643     VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
10644                             : UnitSize == 8 ? &ARM::DPRRegClass
10645                                             : nullptr;
10646 
10647   unsigned BytesLeft = SizeVal % UnitSize;
10648   unsigned LoopSize = SizeVal - BytesLeft;
10649 
10650   if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
10651     // Use LDR and STR to copy.
10652     // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
10653     // [destOut] = STR_POST(scratch, destIn, UnitSize)
10654     unsigned srcIn = src;
10655     unsigned destIn = dest;
10656     for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
10657       Register srcOut = MRI.createVirtualRegister(TRC);
10658       Register destOut = MRI.createVirtualRegister(TRC);
10659       Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
10660       emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
10661                  IsThumb1, IsThumb2);
10662       emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
10663                  IsThumb1, IsThumb2);
10664       srcIn = srcOut;
10665       destIn = destOut;
10666     }
10667 
10668     // Handle the leftover bytes with LDRB and STRB.
10669     // [scratch, srcOut] = LDRB_POST(srcIn, 1)
10670     // [destOut] = STRB_POST(scratch, destIn, 1)
10671     for (unsigned i = 0; i < BytesLeft; i++) {
10672       Register srcOut = MRI.createVirtualRegister(TRC);
10673       Register destOut = MRI.createVirtualRegister(TRC);
10674       Register scratch = MRI.createVirtualRegister(TRC);
10675       emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
10676                  IsThumb1, IsThumb2);
10677       emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
10678                  IsThumb1, IsThumb2);
10679       srcIn = srcOut;
10680       destIn = destOut;
10681     }
10682     MI.eraseFromParent(); // The instruction is gone now.
10683     return BB;
10684   }
10685 
10686   // Expand the pseudo op to a loop.
10687   // thisMBB:
10688   //   ...
10689   //   movw varEnd, # --> with thumb2
10690   //   movt varEnd, #
10691   //   ldrcp varEnd, idx --> without thumb2
10692   //   fallthrough --> loopMBB
10693   // loopMBB:
10694   //   PHI varPhi, varEnd, varLoop
10695   //   PHI srcPhi, src, srcLoop
10696   //   PHI destPhi, dst, destLoop
10697   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
10698   //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
10699   //   subs varLoop, varPhi, #UnitSize
10700   //   bne loopMBB
10701   //   fallthrough --> exitMBB
10702   // exitMBB:
10703   //   epilogue to handle left-over bytes
10704   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
10705   //   [destOut] = STRB_POST(scratch, destLoop, 1)
10706   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
10707   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
10708   MF->insert(It, loopMBB);
10709   MF->insert(It, exitMBB);
10710 
10711   // Transfer the remainder of BB and its successor edges to exitMBB.
10712   exitMBB->splice(exitMBB->begin(), BB,
10713                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
10714   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
10715 
10716   // Load an immediate to varEnd.
10717   Register varEnd = MRI.createVirtualRegister(TRC);
10718   if (Subtarget->useMovt()) {
10719     unsigned Vtmp = varEnd;
10720     if ((LoopSize & 0xFFFF0000) != 0)
10721       Vtmp = MRI.createVirtualRegister(TRC);
10722     BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
10723         .addImm(LoopSize & 0xFFFF)
10724         .add(predOps(ARMCC::AL));
10725 
10726     if ((LoopSize & 0xFFFF0000) != 0)
10727       BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
10728           .addReg(Vtmp)
10729           .addImm(LoopSize >> 16)
10730           .add(predOps(ARMCC::AL));
10731   } else {
10732     MachineConstantPool *ConstantPool = MF->getConstantPool();
10733     Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
10734     const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
10735 
10736     // MachineConstantPool wants an explicit alignment.
10737     Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
10738     unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
10739     MachineMemOperand *CPMMO =
10740         MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
10741                                  MachineMemOperand::MOLoad, 4, Align(4));
10742 
10743     if (IsThumb)
10744       BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
10745           .addReg(varEnd, RegState::Define)
10746           .addConstantPoolIndex(Idx)
10747           .add(predOps(ARMCC::AL))
10748           .addMemOperand(CPMMO);
10749     else
10750       BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
10751           .addReg(varEnd, RegState::Define)
10752           .addConstantPoolIndex(Idx)
10753           .addImm(0)
10754           .add(predOps(ARMCC::AL))
10755           .addMemOperand(CPMMO);
10756   }
10757   BB->addSuccessor(loopMBB);
10758 
10759   // Generate the loop body:
10760   //   varPhi = PHI(varLoop, varEnd)
10761   //   srcPhi = PHI(srcLoop, src)
10762   //   destPhi = PHI(destLoop, dst)
10763   MachineBasicBlock *entryBB = BB;
10764   BB = loopMBB;
10765   Register varLoop = MRI.createVirtualRegister(TRC);
10766   Register varPhi = MRI.createVirtualRegister(TRC);
10767   Register srcLoop = MRI.createVirtualRegister(TRC);
10768   Register srcPhi = MRI.createVirtualRegister(TRC);
10769   Register destLoop = MRI.createVirtualRegister(TRC);
10770   Register destPhi = MRI.createVirtualRegister(TRC);
10771 
10772   BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
10773     .addReg(varLoop).addMBB(loopMBB)
10774     .addReg(varEnd).addMBB(entryBB);
10775   BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
10776     .addReg(srcLoop).addMBB(loopMBB)
10777     .addReg(src).addMBB(entryBB);
10778   BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
10779     .addReg(destLoop).addMBB(loopMBB)
10780     .addReg(dest).addMBB(entryBB);
10781 
10782   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
10783   //   [destLoop] = STR_POST(scratch, destPhi, UnitSiz)
10784   Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
10785   emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
10786              IsThumb1, IsThumb2);
10787   emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
10788              IsThumb1, IsThumb2);
10789 
10790   // Decrement loop variable by UnitSize.
10791   if (IsThumb1) {
10792     BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
10793         .add(t1CondCodeOp())
10794         .addReg(varPhi)
10795         .addImm(UnitSize)
10796         .add(predOps(ARMCC::AL));
10797   } else {
10798     MachineInstrBuilder MIB =
10799         BuildMI(*BB, BB->end(), dl,
10800                 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
10801     MIB.addReg(varPhi)
10802         .addImm(UnitSize)
10803         .add(predOps(ARMCC::AL))
10804         .add(condCodeOp());
10805     MIB->getOperand(5).setReg(ARM::CPSR);
10806     MIB->getOperand(5).setIsDef(true);
10807   }
10808   BuildMI(*BB, BB->end(), dl,
10809           TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
10810       .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
10811 
10812   // loopMBB can loop back to loopMBB or fall through to exitMBB.
10813   BB->addSuccessor(loopMBB);
10814   BB->addSuccessor(exitMBB);
10815 
10816   // Add epilogue to handle BytesLeft.
10817   BB = exitMBB;
10818   auto StartOfExit = exitMBB->begin();
10819 
10820   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
10821   //   [destOut] = STRB_POST(scratch, destLoop, 1)
10822   unsigned srcIn = srcLoop;
10823   unsigned destIn = destLoop;
10824   for (unsigned i = 0; i < BytesLeft; i++) {
10825     Register srcOut = MRI.createVirtualRegister(TRC);
10826     Register destOut = MRI.createVirtualRegister(TRC);
10827     Register scratch = MRI.createVirtualRegister(TRC);
10828     emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
10829                IsThumb1, IsThumb2);
10830     emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
10831                IsThumb1, IsThumb2);
10832     srcIn = srcOut;
10833     destIn = destOut;
10834   }
10835 
10836   MI.eraseFromParent(); // The instruction is gone now.
10837   return BB;
10838 }
10839 
10840 MachineBasicBlock *
10841 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
10842                                        MachineBasicBlock *MBB) const {
10843   const TargetMachine &TM = getTargetMachine();
10844   const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
10845   DebugLoc DL = MI.getDebugLoc();
10846 
10847   assert(Subtarget->isTargetWindows() &&
10848          "__chkstk is only supported on Windows");
10849   assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
10850 
10851   // __chkstk takes the number of words to allocate on the stack in R4, and
10852   // returns the stack adjustment in number of bytes in R4.  This will not
10853   // clober any other registers (other than the obvious lr).
10854   //
10855   // Although, technically, IP should be considered a register which may be
10856   // clobbered, the call itself will not touch it.  Windows on ARM is a pure
10857   // thumb-2 environment, so there is no interworking required.  As a result, we
10858   // do not expect a veneer to be emitted by the linker, clobbering IP.
10859   //
10860   // Each module receives its own copy of __chkstk, so no import thunk is
10861   // required, again, ensuring that IP is not clobbered.
10862   //
10863   // Finally, although some linkers may theoretically provide a trampoline for
10864   // out of range calls (which is quite common due to a 32M range limitation of
10865   // branches for Thumb), we can generate the long-call version via
10866   // -mcmodel=large, alleviating the need for the trampoline which may clobber
10867   // IP.
10868 
10869   switch (TM.getCodeModel()) {
10870   case CodeModel::Tiny:
10871     llvm_unreachable("Tiny code model not available on ARM.");
10872   case CodeModel::Small:
10873   case CodeModel::Medium:
10874   case CodeModel::Kernel:
10875     BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
10876         .add(predOps(ARMCC::AL))
10877         .addExternalSymbol("__chkstk")
10878         .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
10879         .addReg(ARM::R4, RegState::Implicit | RegState::Define)
10880         .addReg(ARM::R12,
10881                 RegState::Implicit | RegState::Define | RegState::Dead)
10882         .addReg(ARM::CPSR,
10883                 RegState::Implicit | RegState::Define | RegState::Dead);
10884     break;
10885   case CodeModel::Large: {
10886     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
10887     Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
10888 
10889     BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
10890       .addExternalSymbol("__chkstk");
10891     BuildMI(*MBB, MI, DL, TII.get(gettBLXrOpcode(*MBB->getParent())))
10892         .add(predOps(ARMCC::AL))
10893         .addReg(Reg, RegState::Kill)
10894         .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
10895         .addReg(ARM::R4, RegState::Implicit | RegState::Define)
10896         .addReg(ARM::R12,
10897                 RegState::Implicit | RegState::Define | RegState::Dead)
10898         .addReg(ARM::CPSR,
10899                 RegState::Implicit | RegState::Define | RegState::Dead);
10900     break;
10901   }
10902   }
10903 
10904   BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
10905       .addReg(ARM::SP, RegState::Kill)
10906       .addReg(ARM::R4, RegState::Kill)
10907       .setMIFlags(MachineInstr::FrameSetup)
10908       .add(predOps(ARMCC::AL))
10909       .add(condCodeOp());
10910 
10911   MI.eraseFromParent();
10912   return MBB;
10913 }
10914 
10915 MachineBasicBlock *
10916 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
10917                                        MachineBasicBlock *MBB) const {
10918   DebugLoc DL = MI.getDebugLoc();
10919   MachineFunction *MF = MBB->getParent();
10920   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10921 
10922   MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
10923   MF->insert(++MBB->getIterator(), ContBB);
10924   ContBB->splice(ContBB->begin(), MBB,
10925                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
10926   ContBB->transferSuccessorsAndUpdatePHIs(MBB);
10927   MBB->addSuccessor(ContBB);
10928 
10929   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
10930   BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
10931   MF->push_back(TrapBB);
10932   MBB->addSuccessor(TrapBB);
10933 
10934   BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
10935       .addReg(MI.getOperand(0).getReg())
10936       .addImm(0)
10937       .add(predOps(ARMCC::AL));
10938   BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
10939       .addMBB(TrapBB)
10940       .addImm(ARMCC::EQ)
10941       .addReg(ARM::CPSR);
10942 
10943   MI.eraseFromParent();
10944   return ContBB;
10945 }
10946 
10947 // The CPSR operand of SelectItr might be missing a kill marker
10948 // because there were multiple uses of CPSR, and ISel didn't know
10949 // which to mark. Figure out whether SelectItr should have had a
10950 // kill marker, and set it if it should. Returns the correct kill
10951 // marker value.
10952 static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
10953                                    MachineBasicBlock* BB,
10954                                    const TargetRegisterInfo* TRI) {
10955   // Scan forward through BB for a use/def of CPSR.
10956   MachineBasicBlock::iterator miI(std::next(SelectItr));
10957   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
10958     const MachineInstr& mi = *miI;
10959     if (mi.readsRegister(ARM::CPSR))
10960       return false;
10961     if (mi.definesRegister(ARM::CPSR))
10962       break; // Should have kill-flag - update below.
10963   }
10964 
10965   // If we hit the end of the block, check whether CPSR is live into a
10966   // successor.
10967   if (miI == BB->end()) {
10968     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
10969                                           sEnd = BB->succ_end();
10970          sItr != sEnd; ++sItr) {
10971       MachineBasicBlock* succ = *sItr;
10972       if (succ->isLiveIn(ARM::CPSR))
10973         return false;
10974     }
10975   }
10976 
10977   // We found a def, or hit the end of the basic block and CPSR wasn't live
10978   // out. SelectMI should have a kill flag on CPSR.
10979   SelectItr->addRegisterKilled(ARM::CPSR, TRI);
10980   return true;
10981 }
10982 
10983 MachineBasicBlock *
10984 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
10985                                                MachineBasicBlock *BB) const {
10986   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10987   DebugLoc dl = MI.getDebugLoc();
10988   bool isThumb2 = Subtarget->isThumb2();
10989   switch (MI.getOpcode()) {
10990   default: {
10991     MI.print(errs());
10992     llvm_unreachable("Unexpected instr type to insert");
10993   }
10994 
10995   // Thumb1 post-indexed loads are really just single-register LDMs.
10996   case ARM::tLDR_postidx: {
10997     MachineOperand Def(MI.getOperand(1));
10998     BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
10999         .add(Def)  // Rn_wb
11000         .add(MI.getOperand(2))  // Rn
11001         .add(MI.getOperand(3))  // PredImm
11002         .add(MI.getOperand(4))  // PredReg
11003         .add(MI.getOperand(0))  // Rt
11004         .cloneMemRefs(MI);
11005     MI.eraseFromParent();
11006     return BB;
11007   }
11008 
11009   // The Thumb2 pre-indexed stores have the same MI operands, they just
11010   // define them differently in the .td files from the isel patterns, so
11011   // they need pseudos.
11012   case ARM::t2STR_preidx:
11013     MI.setDesc(TII->get(ARM::t2STR_PRE));
11014     return BB;
11015   case ARM::t2STRB_preidx:
11016     MI.setDesc(TII->get(ARM::t2STRB_PRE));
11017     return BB;
11018   case ARM::t2STRH_preidx:
11019     MI.setDesc(TII->get(ARM::t2STRH_PRE));
11020     return BB;
11021 
11022   case ARM::STRi_preidx:
11023   case ARM::STRBi_preidx: {
11024     unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
11025                                                          : ARM::STRB_PRE_IMM;
11026     // Decode the offset.
11027     unsigned Offset = MI.getOperand(4).getImm();
11028     bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
11029     Offset = ARM_AM::getAM2Offset(Offset);
11030     if (isSub)
11031       Offset = -Offset;
11032 
11033     MachineMemOperand *MMO = *MI.memoperands_begin();
11034     BuildMI(*BB, MI, dl, TII->get(NewOpc))
11035         .add(MI.getOperand(0)) // Rn_wb
11036         .add(MI.getOperand(1)) // Rt
11037         .add(MI.getOperand(2)) // Rn
11038         .addImm(Offset)        // offset (skip GPR==zero_reg)
11039         .add(MI.getOperand(5)) // pred
11040         .add(MI.getOperand(6))
11041         .addMemOperand(MMO);
11042     MI.eraseFromParent();
11043     return BB;
11044   }
11045   case ARM::STRr_preidx:
11046   case ARM::STRBr_preidx:
11047   case ARM::STRH_preidx: {
11048     unsigned NewOpc;
11049     switch (MI.getOpcode()) {
11050     default: llvm_unreachable("unexpected opcode!");
11051     case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
11052     case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
11053     case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
11054     }
11055     MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
11056     for (unsigned i = 0; i < MI.getNumOperands(); ++i)
11057       MIB.add(MI.getOperand(i));
11058     MI.eraseFromParent();
11059     return BB;
11060   }
11061 
11062   case ARM::tMOVCCr_pseudo: {
11063     // To "insert" a SELECT_CC instruction, we actually have to insert the
11064     // diamond control-flow pattern.  The incoming instruction knows the
11065     // destination vreg to set, the condition code register to branch on, the
11066     // true/false values to select between, and a branch opcode to use.
11067     const BasicBlock *LLVM_BB = BB->getBasicBlock();
11068     MachineFunction::iterator It = ++BB->getIterator();
11069 
11070     //  thisMBB:
11071     //  ...
11072     //   TrueVal = ...
11073     //   cmpTY ccX, r1, r2
11074     //   bCC copy1MBB
11075     //   fallthrough --> copy0MBB
11076     MachineBasicBlock *thisMBB  = BB;
11077     MachineFunction *F = BB->getParent();
11078     MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
11079     MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
11080     F->insert(It, copy0MBB);
11081     F->insert(It, sinkMBB);
11082 
11083     // Check whether CPSR is live past the tMOVCCr_pseudo.
11084     const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
11085     if (!MI.killsRegister(ARM::CPSR) &&
11086         !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
11087       copy0MBB->addLiveIn(ARM::CPSR);
11088       sinkMBB->addLiveIn(ARM::CPSR);
11089     }
11090 
11091     // Transfer the remainder of BB and its successor edges to sinkMBB.
11092     sinkMBB->splice(sinkMBB->begin(), BB,
11093                     std::next(MachineBasicBlock::iterator(MI)), BB->end());
11094     sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
11095 
11096     BB->addSuccessor(copy0MBB);
11097     BB->addSuccessor(sinkMBB);
11098 
11099     BuildMI(BB, dl, TII->get(ARM::tBcc))
11100         .addMBB(sinkMBB)
11101         .addImm(MI.getOperand(3).getImm())
11102         .addReg(MI.getOperand(4).getReg());
11103 
11104     //  copy0MBB:
11105     //   %FalseValue = ...
11106     //   # fallthrough to sinkMBB
11107     BB = copy0MBB;
11108 
11109     // Update machine-CFG edges
11110     BB->addSuccessor(sinkMBB);
11111 
11112     //  sinkMBB:
11113     //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
11114     //  ...
11115     BB = sinkMBB;
11116     BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
11117         .addReg(MI.getOperand(1).getReg())
11118         .addMBB(copy0MBB)
11119         .addReg(MI.getOperand(2).getReg())
11120         .addMBB(thisMBB);
11121 
11122     MI.eraseFromParent(); // The pseudo instruction is gone now.
11123     return BB;
11124   }
11125 
11126   case ARM::BCCi64:
11127   case ARM::BCCZi64: {
11128     // If there is an unconditional branch to the other successor, remove it.
11129     BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
11130 
11131     // Compare both parts that make up the double comparison separately for
11132     // equality.
11133     bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
11134 
11135     Register LHS1 = MI.getOperand(1).getReg();
11136     Register LHS2 = MI.getOperand(2).getReg();
11137     if (RHSisZero) {
11138       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
11139           .addReg(LHS1)
11140           .addImm(0)
11141           .add(predOps(ARMCC::AL));
11142       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
11143         .addReg(LHS2).addImm(0)
11144         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
11145     } else {
11146       Register RHS1 = MI.getOperand(3).getReg();
11147       Register RHS2 = MI.getOperand(4).getReg();
11148       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
11149           .addReg(LHS1)
11150           .addReg(RHS1)
11151           .add(predOps(ARMCC::AL));
11152       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
11153         .addReg(LHS2).addReg(RHS2)
11154         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
11155     }
11156 
11157     MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
11158     MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
11159     if (MI.getOperand(0).getImm() == ARMCC::NE)
11160       std::swap(destMBB, exitMBB);
11161 
11162     BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
11163       .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
11164     if (isThumb2)
11165       BuildMI(BB, dl, TII->get(ARM::t2B))
11166           .addMBB(exitMBB)
11167           .add(predOps(ARMCC::AL));
11168     else
11169       BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
11170 
11171     MI.eraseFromParent(); // The pseudo instruction is gone now.
11172     return BB;
11173   }
11174 
11175   case ARM::Int_eh_sjlj_setjmp:
11176   case ARM::Int_eh_sjlj_setjmp_nofp:
11177   case ARM::tInt_eh_sjlj_setjmp:
11178   case ARM::t2Int_eh_sjlj_setjmp:
11179   case ARM::t2Int_eh_sjlj_setjmp_nofp:
11180     return BB;
11181 
11182   case ARM::Int_eh_sjlj_setup_dispatch:
11183     EmitSjLjDispatchBlock(MI, BB);
11184     return BB;
11185 
11186   case ARM::ABS:
11187   case ARM::t2ABS: {
11188     // To insert an ABS instruction, we have to insert the
11189     // diamond control-flow pattern.  The incoming instruction knows the
11190     // source vreg to test against 0, the destination vreg to set,
11191     // the condition code register to branch on, the
11192     // true/false values to select between, and a branch opcode to use.
11193     // It transforms
11194     //     V1 = ABS V0
11195     // into
11196     //     V2 = MOVS V0
11197     //     BCC                      (branch to SinkBB if V0 >= 0)
11198     //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
11199     //     SinkBB: V1 = PHI(V2, V3)
11200     const BasicBlock *LLVM_BB = BB->getBasicBlock();
11201     MachineFunction::iterator BBI = ++BB->getIterator();
11202     MachineFunction *Fn = BB->getParent();
11203     MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
11204     MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
11205     Fn->insert(BBI, RSBBB);
11206     Fn->insert(BBI, SinkBB);
11207 
11208     Register ABSSrcReg = MI.getOperand(1).getReg();
11209     Register ABSDstReg = MI.getOperand(0).getReg();
11210     bool ABSSrcKIll = MI.getOperand(1).isKill();
11211     bool isThumb2 = Subtarget->isThumb2();
11212     MachineRegisterInfo &MRI = Fn->getRegInfo();
11213     // In Thumb mode S must not be specified if source register is the SP or
11214     // PC and if destination register is the SP, so restrict register class
11215     Register NewRsbDstReg = MRI.createVirtualRegister(
11216         isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
11217 
11218     // Transfer the remainder of BB and its successor edges to sinkMBB.
11219     SinkBB->splice(SinkBB->begin(), BB,
11220                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
11221     SinkBB->transferSuccessorsAndUpdatePHIs(BB);
11222 
11223     BB->addSuccessor(RSBBB);
11224     BB->addSuccessor(SinkBB);
11225 
11226     // fall through to SinkMBB
11227     RSBBB->addSuccessor(SinkBB);
11228 
11229     // insert a cmp at the end of BB
11230     BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
11231         .addReg(ABSSrcReg)
11232         .addImm(0)
11233         .add(predOps(ARMCC::AL));
11234 
11235     // insert a bcc with opposite CC to ARMCC::MI at the end of BB
11236     BuildMI(BB, dl,
11237       TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
11238       .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
11239 
11240     // insert rsbri in RSBBB
11241     // Note: BCC and rsbri will be converted into predicated rsbmi
11242     // by if-conversion pass
11243     BuildMI(*RSBBB, RSBBB->begin(), dl,
11244             TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
11245         .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
11246         .addImm(0)
11247         .add(predOps(ARMCC::AL))
11248         .add(condCodeOp());
11249 
11250     // insert PHI in SinkBB,
11251     // reuse ABSDstReg to not change uses of ABS instruction
11252     BuildMI(*SinkBB, SinkBB->begin(), dl,
11253       TII->get(ARM::PHI), ABSDstReg)
11254       .addReg(NewRsbDstReg).addMBB(RSBBB)
11255       .addReg(ABSSrcReg).addMBB(BB);
11256 
11257     // remove ABS instruction
11258     MI.eraseFromParent();
11259 
11260     // return last added BB
11261     return SinkBB;
11262   }
11263   case ARM::COPY_STRUCT_BYVAL_I32:
11264     ++NumLoopByVals;
11265     return EmitStructByval(MI, BB);
11266   case ARM::WIN__CHKSTK:
11267     return EmitLowered__chkstk(MI, BB);
11268   case ARM::WIN__DBZCHK:
11269     return EmitLowered__dbzchk(MI, BB);
11270   case ARM::t2DoLoopStart:
11271     // We are just here to set a register allocation hint, prefering lr for the
11272     // input register to make it more likely to be movable and removable, later
11273     // in the pipeline.
11274     Register R = MI.getOperand(1).getReg();
11275     MachineFunction *MF = MI.getParent()->getParent();
11276     MF->getRegInfo().setRegAllocationHint(R, ARMRI::RegLR, 0);
11277     return BB;
11278   }
11279 }
11280 
11281 /// Attaches vregs to MEMCPY that it will use as scratch registers
11282 /// when it is expanded into LDM/STM. This is done as a post-isel lowering
11283 /// instead of as a custom inserter because we need the use list from the SDNode.
11284 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
11285                                     MachineInstr &MI, const SDNode *Node) {
11286   bool isThumb1 = Subtarget->isThumb1Only();
11287 
11288   DebugLoc DL = MI.getDebugLoc();
11289   MachineFunction *MF = MI.getParent()->getParent();
11290   MachineRegisterInfo &MRI = MF->getRegInfo();
11291   MachineInstrBuilder MIB(*MF, MI);
11292 
11293   // If the new dst/src is unused mark it as dead.
11294   if (!Node->hasAnyUseOfValue(0)) {
11295     MI.getOperand(0).setIsDead(true);
11296   }
11297   if (!Node->hasAnyUseOfValue(1)) {
11298     MI.getOperand(1).setIsDead(true);
11299   }
11300 
11301   // The MEMCPY both defines and kills the scratch registers.
11302   for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
11303     Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
11304                                                          : &ARM::GPRRegClass);
11305     MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
11306   }
11307 }
11308 
11309 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
11310                                                       SDNode *Node) const {
11311   if (MI.getOpcode() == ARM::MEMCPY) {
11312     attachMEMCPYScratchRegs(Subtarget, MI, Node);
11313     return;
11314   }
11315 
11316   const MCInstrDesc *MCID = &MI.getDesc();
11317   // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
11318   // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
11319   // operand is still set to noreg. If needed, set the optional operand's
11320   // register to CPSR, and remove the redundant implicit def.
11321   //
11322   // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
11323 
11324   // Rename pseudo opcodes.
11325   unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
11326   unsigned ccOutIdx;
11327   if (NewOpc) {
11328     const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
11329     MCID = &TII->get(NewOpc);
11330 
11331     assert(MCID->getNumOperands() ==
11332            MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
11333         && "converted opcode should be the same except for cc_out"
11334            " (and, on Thumb1, pred)");
11335 
11336     MI.setDesc(*MCID);
11337 
11338     // Add the optional cc_out operand
11339     MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
11340 
11341     // On Thumb1, move all input operands to the end, then add the predicate
11342     if (Subtarget->isThumb1Only()) {
11343       for (unsigned c = MCID->getNumOperands() - 4; c--;) {
11344         MI.addOperand(MI.getOperand(1));
11345         MI.RemoveOperand(1);
11346       }
11347 
11348       // Restore the ties
11349       for (unsigned i = MI.getNumOperands(); i--;) {
11350         const MachineOperand& op = MI.getOperand(i);
11351         if (op.isReg() && op.isUse()) {
11352           int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
11353           if (DefIdx != -1)
11354             MI.tieOperands(DefIdx, i);
11355         }
11356       }
11357 
11358       MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
11359       MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
11360       ccOutIdx = 1;
11361     } else
11362       ccOutIdx = MCID->getNumOperands() - 1;
11363   } else
11364     ccOutIdx = MCID->getNumOperands() - 1;
11365 
11366   // Any ARM instruction that sets the 's' bit should specify an optional
11367   // "cc_out" operand in the last operand position.
11368   if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
11369     assert(!NewOpc && "Optional cc_out operand required");
11370     return;
11371   }
11372   // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
11373   // since we already have an optional CPSR def.
11374   bool definesCPSR = false;
11375   bool deadCPSR = false;
11376   for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
11377        ++i) {
11378     const MachineOperand &MO = MI.getOperand(i);
11379     if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
11380       definesCPSR = true;
11381       if (MO.isDead())
11382         deadCPSR = true;
11383       MI.RemoveOperand(i);
11384       break;
11385     }
11386   }
11387   if (!definesCPSR) {
11388     assert(!NewOpc && "Optional cc_out operand required");
11389     return;
11390   }
11391   assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
11392   if (deadCPSR) {
11393     assert(!MI.getOperand(ccOutIdx).getReg() &&
11394            "expect uninitialized optional cc_out operand");
11395     // Thumb1 instructions must have the S bit even if the CPSR is dead.
11396     if (!Subtarget->isThumb1Only())
11397       return;
11398   }
11399 
11400   // If this instruction was defined with an optional CPSR def and its dag node
11401   // had a live implicit CPSR def, then activate the optional CPSR def.
11402   MachineOperand &MO = MI.getOperand(ccOutIdx);
11403   MO.setReg(ARM::CPSR);
11404   MO.setIsDef(true);
11405 }
11406 
11407 //===----------------------------------------------------------------------===//
11408 //                           ARM Optimization Hooks
11409 //===----------------------------------------------------------------------===//
11410 
11411 // Helper function that checks if N is a null or all ones constant.
11412 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
11413   return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
11414 }
11415 
11416 // Return true if N is conditionally 0 or all ones.
11417 // Detects these expressions where cc is an i1 value:
11418 //
11419 //   (select cc 0, y)   [AllOnes=0]
11420 //   (select cc y, 0)   [AllOnes=0]
11421 //   (zext cc)          [AllOnes=0]
11422 //   (sext cc)          [AllOnes=0/1]
11423 //   (select cc -1, y)  [AllOnes=1]
11424 //   (select cc y, -1)  [AllOnes=1]
11425 //
11426 // Invert is set when N is the null/all ones constant when CC is false.
11427 // OtherOp is set to the alternative value of N.
11428 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
11429                                        SDValue &CC, bool &Invert,
11430                                        SDValue &OtherOp,
11431                                        SelectionDAG &DAG) {
11432   switch (N->getOpcode()) {
11433   default: return false;
11434   case ISD::SELECT: {
11435     CC = N->getOperand(0);
11436     SDValue N1 = N->getOperand(1);
11437     SDValue N2 = N->getOperand(2);
11438     if (isZeroOrAllOnes(N1, AllOnes)) {
11439       Invert = false;
11440       OtherOp = N2;
11441       return true;
11442     }
11443     if (isZeroOrAllOnes(N2, AllOnes)) {
11444       Invert = true;
11445       OtherOp = N1;
11446       return true;
11447     }
11448     return false;
11449   }
11450   case ISD::ZERO_EXTEND:
11451     // (zext cc) can never be the all ones value.
11452     if (AllOnes)
11453       return false;
11454     LLVM_FALLTHROUGH;
11455   case ISD::SIGN_EXTEND: {
11456     SDLoc dl(N);
11457     EVT VT = N->getValueType(0);
11458     CC = N->getOperand(0);
11459     if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
11460       return false;
11461     Invert = !AllOnes;
11462     if (AllOnes)
11463       // When looking for an AllOnes constant, N is an sext, and the 'other'
11464       // value is 0.
11465       OtherOp = DAG.getConstant(0, dl, VT);
11466     else if (N->getOpcode() == ISD::ZERO_EXTEND)
11467       // When looking for a 0 constant, N can be zext or sext.
11468       OtherOp = DAG.getConstant(1, dl, VT);
11469     else
11470       OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
11471                                 VT);
11472     return true;
11473   }
11474   }
11475 }
11476 
11477 // Combine a constant select operand into its use:
11478 //
11479 //   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
11480 //   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
11481 //   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
11482 //   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
11483 //   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
11484 //
11485 // The transform is rejected if the select doesn't have a constant operand that
11486 // is null, or all ones when AllOnes is set.
11487 //
11488 // Also recognize sext/zext from i1:
11489 //
11490 //   (add (zext cc), x) -> (select cc (add x, 1), x)
11491 //   (add (sext cc), x) -> (select cc (add x, -1), x)
11492 //
11493 // These transformations eventually create predicated instructions.
11494 //
11495 // @param N       The node to transform.
11496 // @param Slct    The N operand that is a select.
11497 // @param OtherOp The other N operand (x above).
11498 // @param DCI     Context.
11499 // @param AllOnes Require the select constant to be all ones instead of null.
11500 // @returns The new node, or SDValue() on failure.
11501 static
11502 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
11503                             TargetLowering::DAGCombinerInfo &DCI,
11504                             bool AllOnes = false) {
11505   SelectionDAG &DAG = DCI.DAG;
11506   EVT VT = N->getValueType(0);
11507   SDValue NonConstantVal;
11508   SDValue CCOp;
11509   bool SwapSelectOps;
11510   if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
11511                                   NonConstantVal, DAG))
11512     return SDValue();
11513 
11514   // Slct is now know to be the desired identity constant when CC is true.
11515   SDValue TrueVal = OtherOp;
11516   SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
11517                                  OtherOp, NonConstantVal);
11518   // Unless SwapSelectOps says CC should be false.
11519   if (SwapSelectOps)
11520     std::swap(TrueVal, FalseVal);
11521 
11522   return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
11523                      CCOp, TrueVal, FalseVal);
11524 }
11525 
11526 // Attempt combineSelectAndUse on each operand of a commutative operator N.
11527 static
11528 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
11529                                        TargetLowering::DAGCombinerInfo &DCI) {
11530   SDValue N0 = N->getOperand(0);
11531   SDValue N1 = N->getOperand(1);
11532   if (N0.getNode()->hasOneUse())
11533     if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
11534       return Result;
11535   if (N1.getNode()->hasOneUse())
11536     if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
11537       return Result;
11538   return SDValue();
11539 }
11540 
11541 static bool IsVUZPShuffleNode(SDNode *N) {
11542   // VUZP shuffle node.
11543   if (N->getOpcode() == ARMISD::VUZP)
11544     return true;
11545 
11546   // "VUZP" on i32 is an alias for VTRN.
11547   if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
11548     return true;
11549 
11550   return false;
11551 }
11552 
11553 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
11554                                  TargetLowering::DAGCombinerInfo &DCI,
11555                                  const ARMSubtarget *Subtarget) {
11556   // Look for ADD(VUZP.0, VUZP.1).
11557   if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
11558       N0 == N1)
11559    return SDValue();
11560 
11561   // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
11562   if (!N->getValueType(0).is64BitVector())
11563     return SDValue();
11564 
11565   // Generate vpadd.
11566   SelectionDAG &DAG = DCI.DAG;
11567   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11568   SDLoc dl(N);
11569   SDNode *Unzip = N0.getNode();
11570   EVT VT = N->getValueType(0);
11571 
11572   SmallVector<SDValue, 8> Ops;
11573   Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
11574                                 TLI.getPointerTy(DAG.getDataLayout())));
11575   Ops.push_back(Unzip->getOperand(0));
11576   Ops.push_back(Unzip->getOperand(1));
11577 
11578   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
11579 }
11580 
11581 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
11582                                       TargetLowering::DAGCombinerInfo &DCI,
11583                                       const ARMSubtarget *Subtarget) {
11584   // Check for two extended operands.
11585   if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
11586         N1.getOpcode() == ISD::SIGN_EXTEND) &&
11587       !(N0.getOpcode() == ISD::ZERO_EXTEND &&
11588         N1.getOpcode() == ISD::ZERO_EXTEND))
11589     return SDValue();
11590 
11591   SDValue N00 = N0.getOperand(0);
11592   SDValue N10 = N1.getOperand(0);
11593 
11594   // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
11595   if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
11596       N00 == N10)
11597     return SDValue();
11598 
11599   // We only recognize Q register paddl here; this can't be reached until
11600   // after type legalization.
11601   if (!N00.getValueType().is64BitVector() ||
11602       !N0.getValueType().is128BitVector())
11603     return SDValue();
11604 
11605   // Generate vpaddl.
11606   SelectionDAG &DAG = DCI.DAG;
11607   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11608   SDLoc dl(N);
11609   EVT VT = N->getValueType(0);
11610 
11611   SmallVector<SDValue, 8> Ops;
11612   // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
11613   unsigned Opcode;
11614   if (N0.getOpcode() == ISD::SIGN_EXTEND)
11615     Opcode = Intrinsic::arm_neon_vpaddls;
11616   else
11617     Opcode = Intrinsic::arm_neon_vpaddlu;
11618   Ops.push_back(DAG.getConstant(Opcode, dl,
11619                                 TLI.getPointerTy(DAG.getDataLayout())));
11620   EVT ElemTy = N00.getValueType().getVectorElementType();
11621   unsigned NumElts = VT.getVectorNumElements();
11622   EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
11623   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
11624                                N00.getOperand(0), N00.getOperand(1));
11625   Ops.push_back(Concat);
11626 
11627   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
11628 }
11629 
11630 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
11631 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
11632 // much easier to match.
11633 static SDValue
11634 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
11635                                TargetLowering::DAGCombinerInfo &DCI,
11636                                const ARMSubtarget *Subtarget) {
11637   // Only perform optimization if after legalize, and if NEON is available. We
11638   // also expected both operands to be BUILD_VECTORs.
11639   if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
11640       || N0.getOpcode() != ISD::BUILD_VECTOR
11641       || N1.getOpcode() != ISD::BUILD_VECTOR)
11642     return SDValue();
11643 
11644   // Check output type since VPADDL operand elements can only be 8, 16, or 32.
11645   EVT VT = N->getValueType(0);
11646   if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
11647     return SDValue();
11648 
11649   // Check that the vector operands are of the right form.
11650   // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
11651   // operands, where N is the size of the formed vector.
11652   // Each EXTRACT_VECTOR should have the same input vector and odd or even
11653   // index such that we have a pair wise add pattern.
11654 
11655   // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
11656   if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11657     return SDValue();
11658   SDValue Vec = N0->getOperand(0)->getOperand(0);
11659   SDNode *V = Vec.getNode();
11660   unsigned nextIndex = 0;
11661 
11662   // For each operands to the ADD which are BUILD_VECTORs,
11663   // check to see if each of their operands are an EXTRACT_VECTOR with
11664   // the same vector and appropriate index.
11665   for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
11666     if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
11667         && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
11668 
11669       SDValue ExtVec0 = N0->getOperand(i);
11670       SDValue ExtVec1 = N1->getOperand(i);
11671 
11672       // First operand is the vector, verify its the same.
11673       if (V != ExtVec0->getOperand(0).getNode() ||
11674           V != ExtVec1->getOperand(0).getNode())
11675         return SDValue();
11676 
11677       // Second is the constant, verify its correct.
11678       ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
11679       ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
11680 
11681       // For the constant, we want to see all the even or all the odd.
11682       if (!C0 || !C1 || C0->getZExtValue() != nextIndex
11683           || C1->getZExtValue() != nextIndex+1)
11684         return SDValue();
11685 
11686       // Increment index.
11687       nextIndex+=2;
11688     } else
11689       return SDValue();
11690   }
11691 
11692   // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
11693   // we're using the entire input vector, otherwise there's a size/legality
11694   // mismatch somewhere.
11695   if (nextIndex != Vec.getValueType().getVectorNumElements() ||
11696       Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
11697     return SDValue();
11698 
11699   // Create VPADDL node.
11700   SelectionDAG &DAG = DCI.DAG;
11701   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11702 
11703   SDLoc dl(N);
11704 
11705   // Build operand list.
11706   SmallVector<SDValue, 8> Ops;
11707   Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
11708                                 TLI.getPointerTy(DAG.getDataLayout())));
11709 
11710   // Input is the vector.
11711   Ops.push_back(Vec);
11712 
11713   // Get widened type and narrowed type.
11714   MVT widenType;
11715   unsigned numElem = VT.getVectorNumElements();
11716 
11717   EVT inputLaneType = Vec.getValueType().getVectorElementType();
11718   switch (inputLaneType.getSimpleVT().SimpleTy) {
11719     case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
11720     case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
11721     case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
11722     default:
11723       llvm_unreachable("Invalid vector element type for padd optimization.");
11724   }
11725 
11726   SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
11727   unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
11728   return DAG.getNode(ExtOp, dl, VT, tmp);
11729 }
11730 
11731 static SDValue findMUL_LOHI(SDValue V) {
11732   if (V->getOpcode() == ISD::UMUL_LOHI ||
11733       V->getOpcode() == ISD::SMUL_LOHI)
11734     return V;
11735   return SDValue();
11736 }
11737 
11738 static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
11739                                         TargetLowering::DAGCombinerInfo &DCI,
11740                                         const ARMSubtarget *Subtarget) {
11741   if (!Subtarget->hasBaseDSP())
11742     return SDValue();
11743 
11744   // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
11745   // accumulates the product into a 64-bit value. The 16-bit values will
11746   // be sign extended somehow or SRA'd into 32-bit values
11747   // (addc (adde (mul 16bit, 16bit), lo), hi)
11748   SDValue Mul = AddcNode->getOperand(0);
11749   SDValue Lo = AddcNode->getOperand(1);
11750   if (Mul.getOpcode() != ISD::MUL) {
11751     Lo = AddcNode->getOperand(0);
11752     Mul = AddcNode->getOperand(1);
11753     if (Mul.getOpcode() != ISD::MUL)
11754       return SDValue();
11755   }
11756 
11757   SDValue SRA = AddeNode->getOperand(0);
11758   SDValue Hi = AddeNode->getOperand(1);
11759   if (SRA.getOpcode() != ISD::SRA) {
11760     SRA = AddeNode->getOperand(1);
11761     Hi = AddeNode->getOperand(0);
11762     if (SRA.getOpcode() != ISD::SRA)
11763       return SDValue();
11764   }
11765   if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
11766     if (Const->getZExtValue() != 31)
11767       return SDValue();
11768   } else
11769     return SDValue();
11770 
11771   if (SRA.getOperand(0) != Mul)
11772     return SDValue();
11773 
11774   SelectionDAG &DAG = DCI.DAG;
11775   SDLoc dl(AddcNode);
11776   unsigned Opcode = 0;
11777   SDValue Op0;
11778   SDValue Op1;
11779 
11780   if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
11781     Opcode = ARMISD::SMLALBB;
11782     Op0 = Mul.getOperand(0);
11783     Op1 = Mul.getOperand(1);
11784   } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
11785     Opcode = ARMISD::SMLALBT;
11786     Op0 = Mul.getOperand(0);
11787     Op1 = Mul.getOperand(1).getOperand(0);
11788   } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
11789     Opcode = ARMISD::SMLALTB;
11790     Op0 = Mul.getOperand(0).getOperand(0);
11791     Op1 = Mul.getOperand(1);
11792   } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
11793     Opcode = ARMISD::SMLALTT;
11794     Op0 = Mul->getOperand(0).getOperand(0);
11795     Op1 = Mul->getOperand(1).getOperand(0);
11796   }
11797 
11798   if (!Op0 || !Op1)
11799     return SDValue();
11800 
11801   SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
11802                               Op0, Op1, Lo, Hi);
11803   // Replace the ADDs' nodes uses by the MLA node's values.
11804   SDValue HiMLALResult(SMLAL.getNode(), 1);
11805   SDValue LoMLALResult(SMLAL.getNode(), 0);
11806 
11807   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
11808   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
11809 
11810   // Return original node to notify the driver to stop replacing.
11811   SDValue resNode(AddcNode, 0);
11812   return resNode;
11813 }
11814 
11815 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
11816                                      TargetLowering::DAGCombinerInfo &DCI,
11817                                      const ARMSubtarget *Subtarget) {
11818   // Look for multiply add opportunities.
11819   // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
11820   // each add nodes consumes a value from ISD::UMUL_LOHI and there is
11821   // a glue link from the first add to the second add.
11822   // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
11823   // a S/UMLAL instruction.
11824   //                  UMUL_LOHI
11825   //                 / :lo    \ :hi
11826   //                V          \          [no multiline comment]
11827   //    loAdd ->  ADDC         |
11828   //                 \ :carry /
11829   //                  V      V
11830   //                    ADDE   <- hiAdd
11831   //
11832   // In the special case where only the higher part of a signed result is used
11833   // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
11834   // a constant with the exact value of 0x80000000, we recognize we are dealing
11835   // with a "rounded multiply and add" (or subtract) and transform it into
11836   // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
11837 
11838   assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
11839           AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
11840          "Expect an ADDE or SUBE");
11841 
11842   assert(AddeSubeNode->getNumOperands() == 3 &&
11843          AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
11844          "ADDE node has the wrong inputs");
11845 
11846   // Check that we are chained to the right ADDC or SUBC node.
11847   SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
11848   if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
11849        AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
11850       (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
11851        AddcSubcNode->getOpcode() != ARMISD::SUBC))
11852     return SDValue();
11853 
11854   SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
11855   SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
11856 
11857   // Check if the two operands are from the same mul_lohi node.
11858   if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
11859     return SDValue();
11860 
11861   assert(AddcSubcNode->getNumValues() == 2 &&
11862          AddcSubcNode->getValueType(0) == MVT::i32 &&
11863          "Expect ADDC with two result values. First: i32");
11864 
11865   // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
11866   // maybe a SMLAL which multiplies two 16-bit values.
11867   if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
11868       AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
11869       AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
11870       AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
11871       AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
11872     return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
11873 
11874   // Check for the triangle shape.
11875   SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
11876   SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
11877 
11878   // Make sure that the ADDE/SUBE operands are not coming from the same node.
11879   if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
11880     return SDValue();
11881 
11882   // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
11883   bool IsLeftOperandMUL = false;
11884   SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
11885   if (MULOp == SDValue())
11886     MULOp = findMUL_LOHI(AddeSubeOp1);
11887   else
11888     IsLeftOperandMUL = true;
11889   if (MULOp == SDValue())
11890     return SDValue();
11891 
11892   // Figure out the right opcode.
11893   unsigned Opc = MULOp->getOpcode();
11894   unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
11895 
11896   // Figure out the high and low input values to the MLAL node.
11897   SDValue *HiAddSub = nullptr;
11898   SDValue *LoMul = nullptr;
11899   SDValue *LowAddSub = nullptr;
11900 
11901   // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
11902   if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
11903     return SDValue();
11904 
11905   if (IsLeftOperandMUL)
11906     HiAddSub = &AddeSubeOp1;
11907   else
11908     HiAddSub = &AddeSubeOp0;
11909 
11910   // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
11911   // whose low result is fed to the ADDC/SUBC we are checking.
11912 
11913   if (AddcSubcOp0 == MULOp.getValue(0)) {
11914     LoMul = &AddcSubcOp0;
11915     LowAddSub = &AddcSubcOp1;
11916   }
11917   if (AddcSubcOp1 == MULOp.getValue(0)) {
11918     LoMul = &AddcSubcOp1;
11919     LowAddSub = &AddcSubcOp0;
11920   }
11921 
11922   if (!LoMul)
11923     return SDValue();
11924 
11925   // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
11926   // the replacement below will create a cycle.
11927   if (AddcSubcNode == HiAddSub->getNode() ||
11928       AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
11929     return SDValue();
11930 
11931   // Create the merged node.
11932   SelectionDAG &DAG = DCI.DAG;
11933 
11934   // Start building operand list.
11935   SmallVector<SDValue, 8> Ops;
11936   Ops.push_back(LoMul->getOperand(0));
11937   Ops.push_back(LoMul->getOperand(1));
11938 
11939   // Check whether we can use SMMLAR, SMMLSR or SMMULR instead.  For this to be
11940   // the case, we must be doing signed multiplication and only use the higher
11941   // part of the result of the MLAL, furthermore the LowAddSub must be a constant
11942   // addition or subtraction with the value of 0x800000.
11943   if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
11944       FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
11945       LowAddSub->getNode()->getOpcode() == ISD::Constant &&
11946       static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
11947           0x80000000) {
11948     Ops.push_back(*HiAddSub);
11949     if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
11950       FinalOpc = ARMISD::SMMLSR;
11951     } else {
11952       FinalOpc = ARMISD::SMMLAR;
11953     }
11954     SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
11955     DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
11956 
11957     return SDValue(AddeSubeNode, 0);
11958   } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
11959     // SMMLS is generated during instruction selection and the rest of this
11960     // function can not handle the case where AddcSubcNode is a SUBC.
11961     return SDValue();
11962 
11963   // Finish building the operand list for {U/S}MLAL
11964   Ops.push_back(*LowAddSub);
11965   Ops.push_back(*HiAddSub);
11966 
11967   SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
11968                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
11969 
11970   // Replace the ADDs' nodes uses by the MLA node's values.
11971   SDValue HiMLALResult(MLALNode.getNode(), 1);
11972   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
11973 
11974   SDValue LoMLALResult(MLALNode.getNode(), 0);
11975   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
11976 
11977   // Return original node to notify the driver to stop replacing.
11978   return SDValue(AddeSubeNode, 0);
11979 }
11980 
11981 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
11982                                       TargetLowering::DAGCombinerInfo &DCI,
11983                                       const ARMSubtarget *Subtarget) {
11984   // UMAAL is similar to UMLAL except that it adds two unsigned values.
11985   // While trying to combine for the other MLAL nodes, first search for the
11986   // chance to use UMAAL. Check if Addc uses a node which has already
11987   // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
11988   // as the addend, and it's handled in PerformUMLALCombine.
11989 
11990   if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
11991     return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
11992 
11993   // Check that we have a glued ADDC node.
11994   SDNode* AddcNode = AddeNode->getOperand(2).getNode();
11995   if (AddcNode->getOpcode() != ARMISD::ADDC)
11996     return SDValue();
11997 
11998   // Find the converted UMAAL or quit if it doesn't exist.
11999   SDNode *UmlalNode = nullptr;
12000   SDValue AddHi;
12001   if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
12002     UmlalNode = AddcNode->getOperand(0).getNode();
12003     AddHi = AddcNode->getOperand(1);
12004   } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
12005     UmlalNode = AddcNode->getOperand(1).getNode();
12006     AddHi = AddcNode->getOperand(0);
12007   } else {
12008     return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12009   }
12010 
12011   // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
12012   // the ADDC as well as Zero.
12013   if (!isNullConstant(UmlalNode->getOperand(3)))
12014     return SDValue();
12015 
12016   if ((isNullConstant(AddeNode->getOperand(0)) &&
12017        AddeNode->getOperand(1).getNode() == UmlalNode) ||
12018       (AddeNode->getOperand(0).getNode() == UmlalNode &&
12019        isNullConstant(AddeNode->getOperand(1)))) {
12020     SelectionDAG &DAG = DCI.DAG;
12021     SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
12022                       UmlalNode->getOperand(2), AddHi };
12023     SDValue UMAAL =  DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
12024                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
12025 
12026     // Replace the ADDs' nodes uses by the UMAAL node's values.
12027     DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
12028     DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
12029 
12030     // Return original node to notify the driver to stop replacing.
12031     return SDValue(AddeNode, 0);
12032   }
12033   return SDValue();
12034 }
12035 
12036 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
12037                                    const ARMSubtarget *Subtarget) {
12038   if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12039     return SDValue();
12040 
12041   // Check that we have a pair of ADDC and ADDE as operands.
12042   // Both addends of the ADDE must be zero.
12043   SDNode* AddcNode = N->getOperand(2).getNode();
12044   SDNode* AddeNode = N->getOperand(3).getNode();
12045   if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
12046       (AddeNode->getOpcode() == ARMISD::ADDE) &&
12047       isNullConstant(AddeNode->getOperand(0)) &&
12048       isNullConstant(AddeNode->getOperand(1)) &&
12049       (AddeNode->getOperand(2).getNode() == AddcNode))
12050     return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
12051                        DAG.getVTList(MVT::i32, MVT::i32),
12052                        {N->getOperand(0), N->getOperand(1),
12053                         AddcNode->getOperand(0), AddcNode->getOperand(1)});
12054   else
12055     return SDValue();
12056 }
12057 
12058 static SDValue PerformAddcSubcCombine(SDNode *N,
12059                                       TargetLowering::DAGCombinerInfo &DCI,
12060                                       const ARMSubtarget *Subtarget) {
12061   SelectionDAG &DAG(DCI.DAG);
12062 
12063   if (N->getOpcode() == ARMISD::SUBC) {
12064     // (SUBC (ADDE 0, 0, C), 1) -> C
12065     SDValue LHS = N->getOperand(0);
12066     SDValue RHS = N->getOperand(1);
12067     if (LHS->getOpcode() == ARMISD::ADDE &&
12068         isNullConstant(LHS->getOperand(0)) &&
12069         isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
12070       return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
12071     }
12072   }
12073 
12074   if (Subtarget->isThumb1Only()) {
12075     SDValue RHS = N->getOperand(1);
12076     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
12077       int32_t imm = C->getSExtValue();
12078       if (imm < 0 && imm > std::numeric_limits<int>::min()) {
12079         SDLoc DL(N);
12080         RHS = DAG.getConstant(-imm, DL, MVT::i32);
12081         unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
12082                                                            : ARMISD::ADDC;
12083         return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
12084       }
12085     }
12086   }
12087 
12088   return SDValue();
12089 }
12090 
12091 static SDValue PerformAddeSubeCombine(SDNode *N,
12092                                       TargetLowering::DAGCombinerInfo &DCI,
12093                                       const ARMSubtarget *Subtarget) {
12094   if (Subtarget->isThumb1Only()) {
12095     SelectionDAG &DAG = DCI.DAG;
12096     SDValue RHS = N->getOperand(1);
12097     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
12098       int64_t imm = C->getSExtValue();
12099       if (imm < 0) {
12100         SDLoc DL(N);
12101 
12102         // The with-carry-in form matches bitwise not instead of the negation.
12103         // Effectively, the inverse interpretation of the carry flag already
12104         // accounts for part of the negation.
12105         RHS = DAG.getConstant(~imm, DL, MVT::i32);
12106 
12107         unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
12108                                                            : ARMISD::ADDE;
12109         return DAG.getNode(Opcode, DL, N->getVTList(),
12110                            N->getOperand(0), RHS, N->getOperand(2));
12111       }
12112     }
12113   } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
12114     return AddCombineTo64bitMLAL(N, DCI, Subtarget);
12115   }
12116   return SDValue();
12117 }
12118 
12119 static SDValue PerformSELECTCombine(SDNode *N,
12120                                     TargetLowering::DAGCombinerInfo &DCI,
12121                                     const ARMSubtarget *Subtarget) {
12122   if (!Subtarget->hasMVEIntegerOps())
12123     return SDValue();
12124 
12125   SDLoc dl(N);
12126   SDValue SetCC;
12127   SDValue LHS;
12128   SDValue RHS;
12129   ISD::CondCode CC;
12130   SDValue TrueVal;
12131   SDValue FalseVal;
12132 
12133   if (N->getOpcode() == ISD::SELECT &&
12134       N->getOperand(0)->getOpcode() == ISD::SETCC) {
12135     SetCC = N->getOperand(0);
12136     LHS = SetCC->getOperand(0);
12137     RHS = SetCC->getOperand(1);
12138     CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
12139     TrueVal = N->getOperand(1);
12140     FalseVal = N->getOperand(2);
12141   } else if (N->getOpcode() == ISD::SELECT_CC) {
12142     LHS = N->getOperand(0);
12143     RHS = N->getOperand(1);
12144     CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
12145     TrueVal = N->getOperand(2);
12146     FalseVal = N->getOperand(3);
12147   } else {
12148     return SDValue();
12149   }
12150 
12151   unsigned int Opcode = 0;
12152   if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
12153        FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
12154       (CC == ISD::SETULT || CC == ISD::SETUGT)) {
12155     Opcode = ARMISD::VMINVu;
12156     if (CC == ISD::SETUGT)
12157       std::swap(TrueVal, FalseVal);
12158   } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
12159               FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
12160              (CC == ISD::SETLT || CC == ISD::SETGT)) {
12161     Opcode = ARMISD::VMINVs;
12162     if (CC == ISD::SETGT)
12163       std::swap(TrueVal, FalseVal);
12164   } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
12165               FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
12166              (CC == ISD::SETUGT || CC == ISD::SETULT)) {
12167     Opcode = ARMISD::VMAXVu;
12168     if (CC == ISD::SETULT)
12169       std::swap(TrueVal, FalseVal);
12170   } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
12171               FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
12172              (CC == ISD::SETGT || CC == ISD::SETLT)) {
12173     Opcode = ARMISD::VMAXVs;
12174     if (CC == ISD::SETLT)
12175       std::swap(TrueVal, FalseVal);
12176   } else
12177     return SDValue();
12178 
12179   // Normalise to the right hand side being the vector reduction
12180   switch (TrueVal->getOpcode()) {
12181   case ISD::VECREDUCE_UMIN:
12182   case ISD::VECREDUCE_SMIN:
12183   case ISD::VECREDUCE_UMAX:
12184   case ISD::VECREDUCE_SMAX:
12185     std::swap(LHS, RHS);
12186     std::swap(TrueVal, FalseVal);
12187     break;
12188   }
12189 
12190   EVT VectorType = FalseVal->getOperand(0).getValueType();
12191 
12192   if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
12193       VectorType != MVT::v4i32)
12194     return SDValue();
12195 
12196   EVT VectorScalarType = VectorType.getVectorElementType();
12197 
12198   // The values being selected must also be the ones being compared
12199   if (TrueVal != LHS || FalseVal != RHS)
12200     return SDValue();
12201 
12202   EVT LeftType = LHS->getValueType(0);
12203   EVT RightType = RHS->getValueType(0);
12204 
12205   // The types must match the reduced type too
12206   if (LeftType != VectorScalarType || RightType != VectorScalarType)
12207     return SDValue();
12208 
12209   // Legalise the scalar to an i32
12210   if (VectorScalarType != MVT::i32)
12211     LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
12212 
12213   // Generate the reduction as an i32 for legalisation purposes
12214   auto Reduction =
12215       DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
12216 
12217   // The result isn't actually an i32 so truncate it back to its original type
12218   if (VectorScalarType != MVT::i32)
12219     Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
12220 
12221   return Reduction;
12222 }
12223 
12224 // A special combine for the vqdmulh family of instructions. This is one of the
12225 // potential set of patterns that could patch this instruction. The base pattern
12226 // you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
12227 // This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
12228 // which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as
12229 // the max is unnecessary.
12230 static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
12231   EVT VT = N->getValueType(0);
12232   SDValue Shft;
12233   ConstantSDNode *Clamp;
12234 
12235   if (N->getOpcode() == ISD::SMIN) {
12236     Shft = N->getOperand(0);
12237     Clamp = isConstOrConstSplat(N->getOperand(1));
12238   } else if (N->getOpcode() == ISD::VSELECT) {
12239     // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
12240     SDValue Cmp = N->getOperand(0);
12241     if (Cmp.getOpcode() != ISD::SETCC ||
12242         cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
12243         Cmp.getOperand(0) != N->getOperand(1) ||
12244         Cmp.getOperand(1) != N->getOperand(2))
12245       return SDValue();
12246     Shft = N->getOperand(1);
12247     Clamp = isConstOrConstSplat(N->getOperand(2));
12248   } else
12249     return SDValue();
12250 
12251   if (!Clamp)
12252     return SDValue();
12253 
12254   MVT ScalarType;
12255   int ShftAmt = 0;
12256   switch (Clamp->getSExtValue()) {
12257   case (1 << 7) - 1:
12258     ScalarType = MVT::i8;
12259     ShftAmt = 7;
12260     break;
12261   case (1 << 15) - 1:
12262     ScalarType = MVT::i16;
12263     ShftAmt = 15;
12264     break;
12265   case (1ULL << 31) - 1:
12266     ScalarType = MVT::i32;
12267     ShftAmt = 31;
12268     break;
12269   default:
12270     return SDValue();
12271   }
12272 
12273   if (Shft.getOpcode() != ISD::SRA)
12274     return SDValue();
12275   ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
12276   if (!N1 || N1->getSExtValue() != ShftAmt)
12277     return SDValue();
12278 
12279   SDValue Mul = Shft.getOperand(0);
12280   if (Mul.getOpcode() != ISD::MUL)
12281     return SDValue();
12282 
12283   SDValue Ext0 = Mul.getOperand(0);
12284   SDValue Ext1 = Mul.getOperand(1);
12285   if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
12286       Ext1.getOpcode() != ISD::SIGN_EXTEND)
12287     return SDValue();
12288   EVT VecVT = Ext0.getOperand(0).getValueType();
12289   if (VecVT != MVT::v4i32 && VecVT != MVT::v8i16 && VecVT != MVT::v16i8)
12290     return SDValue();
12291   if (Ext1.getOperand(0).getValueType() != VecVT ||
12292       VecVT.getScalarType() != ScalarType ||
12293       VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
12294     return SDValue();
12295 
12296   SDLoc DL(Mul);
12297   SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, VecVT, Ext0.getOperand(0),
12298                                 Ext1.getOperand(0));
12299   return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, VQDMULH);
12300 }
12301 
12302 static SDValue PerformVSELECTCombine(SDNode *N,
12303                                      TargetLowering::DAGCombinerInfo &DCI,
12304                                      const ARMSubtarget *Subtarget) {
12305   if (!Subtarget->hasMVEIntegerOps())
12306     return SDValue();
12307 
12308   if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
12309     return V;
12310 
12311   // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
12312   //
12313   // We need to re-implement this optimization here as the implementation in the
12314   // Target-Independent DAGCombiner does not handle the kind of constant we make
12315   // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
12316   // good reason, allowing truncation there would break other targets).
12317   //
12318   // Currently, this is only done for MVE, as it's the only target that benefits
12319   // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
12320   if (N->getOperand(0).getOpcode() != ISD::XOR)
12321     return SDValue();
12322   SDValue XOR = N->getOperand(0);
12323 
12324   // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
12325   // It is important to check with truncation allowed as the BUILD_VECTORs we
12326   // generate in those situations will truncate their operands.
12327   ConstantSDNode *Const =
12328       isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
12329                           /*AllowTruncation*/ true);
12330   if (!Const || !Const->isOne())
12331     return SDValue();
12332 
12333   // Rewrite into vselect(cond, rhs, lhs).
12334   SDValue Cond = XOR->getOperand(0);
12335   SDValue LHS = N->getOperand(1);
12336   SDValue RHS = N->getOperand(2);
12337   EVT Type = N->getValueType(0);
12338   return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
12339 }
12340 
12341 static SDValue PerformABSCombine(SDNode *N,
12342                                   TargetLowering::DAGCombinerInfo &DCI,
12343                                   const ARMSubtarget *Subtarget) {
12344   SDValue res;
12345   SelectionDAG &DAG = DCI.DAG;
12346   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12347 
12348   if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
12349     return SDValue();
12350 
12351   if (!TLI.expandABS(N, res, DAG))
12352       return SDValue();
12353 
12354   return res;
12355 }
12356 
12357 /// PerformADDECombine - Target-specific dag combine transform from
12358 /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
12359 /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
12360 static SDValue PerformADDECombine(SDNode *N,
12361                                   TargetLowering::DAGCombinerInfo &DCI,
12362                                   const ARMSubtarget *Subtarget) {
12363   // Only ARM and Thumb2 support UMLAL/SMLAL.
12364   if (Subtarget->isThumb1Only())
12365     return PerformAddeSubeCombine(N, DCI, Subtarget);
12366 
12367   // Only perform the checks after legalize when the pattern is available.
12368   if (DCI.isBeforeLegalize()) return SDValue();
12369 
12370   return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
12371 }
12372 
12373 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
12374 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
12375 /// called with the default operands, and if that fails, with commuted
12376 /// operands.
12377 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
12378                                           TargetLowering::DAGCombinerInfo &DCI,
12379                                           const ARMSubtarget *Subtarget){
12380   // Attempt to create vpadd for this add.
12381   if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
12382     return Result;
12383 
12384   // Attempt to create vpaddl for this add.
12385   if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
12386     return Result;
12387   if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
12388                                                       Subtarget))
12389     return Result;
12390 
12391   // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
12392   if (N0.getNode()->hasOneUse())
12393     if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
12394       return Result;
12395   return SDValue();
12396 }
12397 
12398 static SDValue PerformADDVecReduce(SDNode *N,
12399                                    TargetLowering::DAGCombinerInfo &DCI,
12400                                    const ARMSubtarget *Subtarget) {
12401   if (!Subtarget->hasMVEIntegerOps() || N->getValueType(0) != MVT::i64)
12402     return SDValue();
12403 
12404   SDValue N0 = N->getOperand(0);
12405   SDValue N1 = N->getOperand(1);
12406 
12407   // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
12408   // will look like:
12409   //   t1: i32,i32 = ARMISD::VADDLVs x
12410   //   t2: i64 = build_pair t1, t1:1
12411   //   t3: i64 = add t2, y
12412   // We also need to check for sext / zext and commutitive adds.
12413   auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
12414                            SDValue NB) {
12415     if (NB->getOpcode() != ISD::BUILD_PAIR)
12416       return SDValue();
12417     SDValue VecRed = NB->getOperand(0);
12418     if (VecRed->getOpcode() != Opcode || VecRed.getResNo() != 0 ||
12419         NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
12420       return SDValue();
12421 
12422     SDLoc dl(N);
12423     SmallVector<SDValue, 4> Ops;
12424     Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
12425                                   DCI.DAG.getConstant(0, dl, MVT::i32)));
12426     Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
12427                                   DCI.DAG.getConstant(1, dl, MVT::i32)));
12428     for (unsigned i = 0, e = VecRed.getNumOperands(); i < e; i++)
12429       Ops.push_back(VecRed->getOperand(i));
12430     SDValue Red = DCI.DAG.getNode(OpcodeA, dl,
12431                                   DCI.DAG.getVTList({MVT::i32, MVT::i32}), Ops);
12432     return DCI.DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
12433                            SDValue(Red.getNode(), 1));
12434   };
12435 
12436   if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
12437     return M;
12438   if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
12439     return M;
12440   if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
12441     return M;
12442   if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
12443     return M;
12444   if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
12445     return M;
12446   if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
12447     return M;
12448   if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
12449     return M;
12450   if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
12451     return M;
12452   if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
12453     return M;
12454   if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
12455     return M;
12456   if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
12457     return M;
12458   if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
12459     return M;
12460   if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
12461     return M;
12462   if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
12463     return M;
12464   if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
12465     return M;
12466   if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
12467     return M;
12468   return SDValue();
12469 }
12470 
12471 bool
12472 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
12473                                                  CombineLevel Level) const {
12474   if (Level == BeforeLegalizeTypes)
12475     return true;
12476 
12477   if (N->getOpcode() != ISD::SHL)
12478     return true;
12479 
12480   if (Subtarget->isThumb1Only()) {
12481     // Avoid making expensive immediates by commuting shifts. (This logic
12482     // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
12483     // for free.)
12484     if (N->getOpcode() != ISD::SHL)
12485       return true;
12486     SDValue N1 = N->getOperand(0);
12487     if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
12488         N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
12489       return true;
12490     if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
12491       if (Const->getAPIntValue().ult(256))
12492         return false;
12493       if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
12494           Const->getAPIntValue().sgt(-256))
12495         return false;
12496     }
12497     return true;
12498   }
12499 
12500   // Turn off commute-with-shift transform after legalization, so it doesn't
12501   // conflict with PerformSHLSimplify.  (We could try to detect when
12502   // PerformSHLSimplify would trigger more precisely, but it isn't
12503   // really necessary.)
12504   return false;
12505 }
12506 
12507 bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
12508     const SDNode *N, CombineLevel Level) const {
12509   if (!Subtarget->isThumb1Only())
12510     return true;
12511 
12512   if (Level == BeforeLegalizeTypes)
12513     return true;
12514 
12515   return false;
12516 }
12517 
12518 bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
12519   if (!Subtarget->hasNEON()) {
12520     if (Subtarget->isThumb1Only())
12521       return VT.getScalarSizeInBits() <= 32;
12522     return true;
12523   }
12524   return VT.isScalarInteger();
12525 }
12526 
12527 static SDValue PerformSHLSimplify(SDNode *N,
12528                                 TargetLowering::DAGCombinerInfo &DCI,
12529                                 const ARMSubtarget *ST) {
12530   // Allow the generic combiner to identify potential bswaps.
12531   if (DCI.isBeforeLegalize())
12532     return SDValue();
12533 
12534   // DAG combiner will fold:
12535   // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
12536   // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2
12537   // Other code patterns that can be also be modified have the following form:
12538   // b + ((a << 1) | 510)
12539   // b + ((a << 1) & 510)
12540   // b + ((a << 1) ^ 510)
12541   // b + ((a << 1) + 510)
12542 
12543   // Many instructions can  perform the shift for free, but it requires both
12544   // the operands to be registers. If c1 << c2 is too large, a mov immediate
12545   // instruction will needed. So, unfold back to the original pattern if:
12546   // - if c1 and c2 are small enough that they don't require mov imms.
12547   // - the user(s) of the node can perform an shl
12548 
12549   // No shifted operands for 16-bit instructions.
12550   if (ST->isThumb() && ST->isThumb1Only())
12551     return SDValue();
12552 
12553   // Check that all the users could perform the shl themselves.
12554   for (auto U : N->uses()) {
12555     switch(U->getOpcode()) {
12556     default:
12557       return SDValue();
12558     case ISD::SUB:
12559     case ISD::ADD:
12560     case ISD::AND:
12561     case ISD::OR:
12562     case ISD::XOR:
12563     case ISD::SETCC:
12564     case ARMISD::CMP:
12565       // Check that the user isn't already using a constant because there
12566       // aren't any instructions that support an immediate operand and a
12567       // shifted operand.
12568       if (isa<ConstantSDNode>(U->getOperand(0)) ||
12569           isa<ConstantSDNode>(U->getOperand(1)))
12570         return SDValue();
12571 
12572       // Check that it's not already using a shift.
12573       if (U->getOperand(0).getOpcode() == ISD::SHL ||
12574           U->getOperand(1).getOpcode() == ISD::SHL)
12575         return SDValue();
12576       break;
12577     }
12578   }
12579 
12580   if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
12581       N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
12582     return SDValue();
12583 
12584   if (N->getOperand(0).getOpcode() != ISD::SHL)
12585     return SDValue();
12586 
12587   SDValue SHL = N->getOperand(0);
12588 
12589   auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
12590   auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
12591   if (!C1ShlC2 || !C2)
12592     return SDValue();
12593 
12594   APInt C2Int = C2->getAPIntValue();
12595   APInt C1Int = C1ShlC2->getAPIntValue();
12596 
12597   // Check that performing a lshr will not lose any information.
12598   APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
12599                                      C2Int.getBitWidth() - C2->getZExtValue());
12600   if ((C1Int & Mask) != C1Int)
12601     return SDValue();
12602 
12603   // Shift the first constant.
12604   C1Int.lshrInPlace(C2Int);
12605 
12606   // The immediates are encoded as an 8-bit value that can be rotated.
12607   auto LargeImm = [](const APInt &Imm) {
12608     unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
12609     return Imm.getBitWidth() - Zeros > 8;
12610   };
12611 
12612   if (LargeImm(C1Int) || LargeImm(C2Int))
12613     return SDValue();
12614 
12615   SelectionDAG &DAG = DCI.DAG;
12616   SDLoc dl(N);
12617   SDValue X = SHL.getOperand(0);
12618   SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
12619                               DAG.getConstant(C1Int, dl, MVT::i32));
12620   // Shift left to compensate for the lshr of C1Int.
12621   SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
12622 
12623   LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
12624              SHL.dump(); N->dump());
12625   LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
12626   return Res;
12627 }
12628 
12629 
12630 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
12631 ///
12632 static SDValue PerformADDCombine(SDNode *N,
12633                                  TargetLowering::DAGCombinerInfo &DCI,
12634                                  const ARMSubtarget *Subtarget) {
12635   SDValue N0 = N->getOperand(0);
12636   SDValue N1 = N->getOperand(1);
12637 
12638   // Only works one way, because it needs an immediate operand.
12639   if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
12640     return Result;
12641 
12642   if (SDValue Result = PerformADDVecReduce(N, DCI, Subtarget))
12643     return Result;
12644 
12645   // First try with the default operand order.
12646   if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
12647     return Result;
12648 
12649   // If that didn't work, try again with the operands commuted.
12650   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
12651 }
12652 
12653 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
12654 ///
12655 static SDValue PerformSUBCombine(SDNode *N,
12656                                  TargetLowering::DAGCombinerInfo &DCI,
12657                                  const ARMSubtarget *Subtarget) {
12658   SDValue N0 = N->getOperand(0);
12659   SDValue N1 = N->getOperand(1);
12660 
12661   // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
12662   if (N1.getNode()->hasOneUse())
12663     if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
12664       return Result;
12665 
12666   if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
12667     return SDValue();
12668 
12669   // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
12670   // so that we can readily pattern match more mve instructions which can use
12671   // a scalar operand.
12672   SDValue VDup = N->getOperand(1);
12673   if (VDup->getOpcode() != ARMISD::VDUP)
12674     return SDValue();
12675 
12676   SDValue VMov = N->getOperand(0);
12677   if (VMov->getOpcode() == ISD::BITCAST)
12678     VMov = VMov->getOperand(0);
12679 
12680   if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
12681     return SDValue();
12682 
12683   SDLoc dl(N);
12684   SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
12685                                    DCI.DAG.getConstant(0, dl, MVT::i32),
12686                                    VDup->getOperand(0));
12687   return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
12688 }
12689 
12690 /// PerformVMULCombine
12691 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
12692 /// special multiplier accumulator forwarding.
12693 ///   vmul d3, d0, d2
12694 ///   vmla d3, d1, d2
12695 /// is faster than
12696 ///   vadd d3, d0, d1
12697 ///   vmul d3, d3, d2
12698 //  However, for (A + B) * (A + B),
12699 //    vadd d2, d0, d1
12700 //    vmul d3, d0, d2
12701 //    vmla d3, d1, d2
12702 //  is slower than
12703 //    vadd d2, d0, d1
12704 //    vmul d3, d2, d2
12705 static SDValue PerformVMULCombine(SDNode *N,
12706                                   TargetLowering::DAGCombinerInfo &DCI,
12707                                   const ARMSubtarget *Subtarget) {
12708   if (!Subtarget->hasVMLxForwarding())
12709     return SDValue();
12710 
12711   SelectionDAG &DAG = DCI.DAG;
12712   SDValue N0 = N->getOperand(0);
12713   SDValue N1 = N->getOperand(1);
12714   unsigned Opcode = N0.getOpcode();
12715   if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
12716       Opcode != ISD::FADD && Opcode != ISD::FSUB) {
12717     Opcode = N1.getOpcode();
12718     if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
12719         Opcode != ISD::FADD && Opcode != ISD::FSUB)
12720       return SDValue();
12721     std::swap(N0, N1);
12722   }
12723 
12724   if (N0 == N1)
12725     return SDValue();
12726 
12727   EVT VT = N->getValueType(0);
12728   SDLoc DL(N);
12729   SDValue N00 = N0->getOperand(0);
12730   SDValue N01 = N0->getOperand(1);
12731   return DAG.getNode(Opcode, DL, VT,
12732                      DAG.getNode(ISD::MUL, DL, VT, N00, N1),
12733                      DAG.getNode(ISD::MUL, DL, VT, N01, N1));
12734 }
12735 
12736 static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
12737                                       const ARMSubtarget *Subtarget) {
12738   EVT VT = N->getValueType(0);
12739   if (VT != MVT::v2i64)
12740     return SDValue();
12741 
12742   SDValue N0 = N->getOperand(0);
12743   SDValue N1 = N->getOperand(1);
12744 
12745   auto IsSignExt = [&](SDValue Op) {
12746     if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
12747       return SDValue();
12748     EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
12749     if (VT.getScalarSizeInBits() == 32)
12750       return Op->getOperand(0);
12751     return SDValue();
12752   };
12753   auto IsZeroExt = [&](SDValue Op) {
12754     // Zero extends are a little more awkward. At the point we are matching
12755     // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
12756     // That might be before of after a bitcast depending on how the and is
12757     // placed. Because this has to look through bitcasts, it is currently only
12758     // supported on LE.
12759     if (!Subtarget->isLittle())
12760       return SDValue();
12761 
12762     SDValue And = Op;
12763     if (And->getOpcode() == ISD::BITCAST)
12764       And = And->getOperand(0);
12765     if (And->getOpcode() != ISD::AND)
12766       return SDValue();
12767     SDValue Mask = And->getOperand(1);
12768     if (Mask->getOpcode() == ISD::BITCAST)
12769       Mask = Mask->getOperand(0);
12770 
12771     if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
12772         Mask.getValueType() != MVT::v4i32)
12773       return SDValue();
12774     if (isAllOnesConstant(Mask->getOperand(0)) &&
12775         isNullConstant(Mask->getOperand(1)) &&
12776         isAllOnesConstant(Mask->getOperand(2)) &&
12777         isNullConstant(Mask->getOperand(3)))
12778       return And->getOperand(0);
12779     return SDValue();
12780   };
12781 
12782   SDLoc dl(N);
12783   if (SDValue Op0 = IsSignExt(N0)) {
12784     if (SDValue Op1 = IsSignExt(N1)) {
12785       SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
12786       SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
12787       return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
12788     }
12789   }
12790   if (SDValue Op0 = IsZeroExt(N0)) {
12791     if (SDValue Op1 = IsZeroExt(N1)) {
12792       SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
12793       SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
12794       return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
12795     }
12796   }
12797 
12798   return SDValue();
12799 }
12800 
12801 static SDValue PerformMULCombine(SDNode *N,
12802                                  TargetLowering::DAGCombinerInfo &DCI,
12803                                  const ARMSubtarget *Subtarget) {
12804   SelectionDAG &DAG = DCI.DAG;
12805 
12806   EVT VT = N->getValueType(0);
12807   if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
12808     return PerformMVEVMULLCombine(N, DAG, Subtarget);
12809 
12810   if (Subtarget->isThumb1Only())
12811     return SDValue();
12812 
12813   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
12814     return SDValue();
12815 
12816   if (VT.is64BitVector() || VT.is128BitVector())
12817     return PerformVMULCombine(N, DCI, Subtarget);
12818   if (VT != MVT::i32)
12819     return SDValue();
12820 
12821   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
12822   if (!C)
12823     return SDValue();
12824 
12825   int64_t MulAmt = C->getSExtValue();
12826   unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
12827 
12828   ShiftAmt = ShiftAmt & (32 - 1);
12829   SDValue V = N->getOperand(0);
12830   SDLoc DL(N);
12831 
12832   SDValue Res;
12833   MulAmt >>= ShiftAmt;
12834 
12835   if (MulAmt >= 0) {
12836     if (isPowerOf2_32(MulAmt - 1)) {
12837       // (mul x, 2^N + 1) => (add (shl x, N), x)
12838       Res = DAG.getNode(ISD::ADD, DL, VT,
12839                         V,
12840                         DAG.getNode(ISD::SHL, DL, VT,
12841                                     V,
12842                                     DAG.getConstant(Log2_32(MulAmt - 1), DL,
12843                                                     MVT::i32)));
12844     } else if (isPowerOf2_32(MulAmt + 1)) {
12845       // (mul x, 2^N - 1) => (sub (shl x, N), x)
12846       Res = DAG.getNode(ISD::SUB, DL, VT,
12847                         DAG.getNode(ISD::SHL, DL, VT,
12848                                     V,
12849                                     DAG.getConstant(Log2_32(MulAmt + 1), DL,
12850                                                     MVT::i32)),
12851                         V);
12852     } else
12853       return SDValue();
12854   } else {
12855     uint64_t MulAmtAbs = -MulAmt;
12856     if (isPowerOf2_32(MulAmtAbs + 1)) {
12857       // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
12858       Res = DAG.getNode(ISD::SUB, DL, VT,
12859                         V,
12860                         DAG.getNode(ISD::SHL, DL, VT,
12861                                     V,
12862                                     DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
12863                                                     MVT::i32)));
12864     } else if (isPowerOf2_32(MulAmtAbs - 1)) {
12865       // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
12866       Res = DAG.getNode(ISD::ADD, DL, VT,
12867                         V,
12868                         DAG.getNode(ISD::SHL, DL, VT,
12869                                     V,
12870                                     DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
12871                                                     MVT::i32)));
12872       Res = DAG.getNode(ISD::SUB, DL, VT,
12873                         DAG.getConstant(0, DL, MVT::i32), Res);
12874     } else
12875       return SDValue();
12876   }
12877 
12878   if (ShiftAmt != 0)
12879     Res = DAG.getNode(ISD::SHL, DL, VT,
12880                       Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
12881 
12882   // Do not add new nodes to DAG combiner worklist.
12883   DCI.CombineTo(N, Res, false);
12884   return SDValue();
12885 }
12886 
12887 static SDValue CombineANDShift(SDNode *N,
12888                                TargetLowering::DAGCombinerInfo &DCI,
12889                                const ARMSubtarget *Subtarget) {
12890   // Allow DAGCombine to pattern-match before we touch the canonical form.
12891   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
12892     return SDValue();
12893 
12894   if (N->getValueType(0) != MVT::i32)
12895     return SDValue();
12896 
12897   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
12898   if (!N1C)
12899     return SDValue();
12900 
12901   uint32_t C1 = (uint32_t)N1C->getZExtValue();
12902   // Don't transform uxtb/uxth.
12903   if (C1 == 255 || C1 == 65535)
12904     return SDValue();
12905 
12906   SDNode *N0 = N->getOperand(0).getNode();
12907   if (!N0->hasOneUse())
12908     return SDValue();
12909 
12910   if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
12911     return SDValue();
12912 
12913   bool LeftShift = N0->getOpcode() == ISD::SHL;
12914 
12915   ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
12916   if (!N01C)
12917     return SDValue();
12918 
12919   uint32_t C2 = (uint32_t)N01C->getZExtValue();
12920   if (!C2 || C2 >= 32)
12921     return SDValue();
12922 
12923   // Clear irrelevant bits in the mask.
12924   if (LeftShift)
12925     C1 &= (-1U << C2);
12926   else
12927     C1 &= (-1U >> C2);
12928 
12929   SelectionDAG &DAG = DCI.DAG;
12930   SDLoc DL(N);
12931 
12932   // We have a pattern of the form "(and (shl x, c2) c1)" or
12933   // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
12934   // transform to a pair of shifts, to save materializing c1.
12935 
12936   // First pattern: right shift, then mask off leading bits.
12937   // FIXME: Use demanded bits?
12938   if (!LeftShift && isMask_32(C1)) {
12939     uint32_t C3 = countLeadingZeros(C1);
12940     if (C2 < C3) {
12941       SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
12942                                 DAG.getConstant(C3 - C2, DL, MVT::i32));
12943       return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
12944                          DAG.getConstant(C3, DL, MVT::i32));
12945     }
12946   }
12947 
12948   // First pattern, reversed: left shift, then mask off trailing bits.
12949   if (LeftShift && isMask_32(~C1)) {
12950     uint32_t C3 = countTrailingZeros(C1);
12951     if (C2 < C3) {
12952       SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
12953                                 DAG.getConstant(C3 - C2, DL, MVT::i32));
12954       return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
12955                          DAG.getConstant(C3, DL, MVT::i32));
12956     }
12957   }
12958 
12959   // Second pattern: left shift, then mask off leading bits.
12960   // FIXME: Use demanded bits?
12961   if (LeftShift && isShiftedMask_32(C1)) {
12962     uint32_t Trailing = countTrailingZeros(C1);
12963     uint32_t C3 = countLeadingZeros(C1);
12964     if (Trailing == C2 && C2 + C3 < 32) {
12965       SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
12966                                 DAG.getConstant(C2 + C3, DL, MVT::i32));
12967       return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
12968                         DAG.getConstant(C3, DL, MVT::i32));
12969     }
12970   }
12971 
12972   // Second pattern, reversed: right shift, then mask off trailing bits.
12973   // FIXME: Handle other patterns of known/demanded bits.
12974   if (!LeftShift && isShiftedMask_32(C1)) {
12975     uint32_t Leading = countLeadingZeros(C1);
12976     uint32_t C3 = countTrailingZeros(C1);
12977     if (Leading == C2 && C2 + C3 < 32) {
12978       SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
12979                                 DAG.getConstant(C2 + C3, DL, MVT::i32));
12980       return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
12981                          DAG.getConstant(C3, DL, MVT::i32));
12982     }
12983   }
12984 
12985   // FIXME: Transform "(and (shl x, c2) c1)" ->
12986   // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
12987   // c1.
12988   return SDValue();
12989 }
12990 
12991 static SDValue PerformANDCombine(SDNode *N,
12992                                  TargetLowering::DAGCombinerInfo &DCI,
12993                                  const ARMSubtarget *Subtarget) {
12994   // Attempt to use immediate-form VBIC
12995   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
12996   SDLoc dl(N);
12997   EVT VT = N->getValueType(0);
12998   SelectionDAG &DAG = DCI.DAG;
12999 
13000   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v4i1 ||
13001       VT == MVT::v8i1 || VT == MVT::v16i1)
13002     return SDValue();
13003 
13004   APInt SplatBits, SplatUndef;
13005   unsigned SplatBitSize;
13006   bool HasAnyUndefs;
13007   if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
13008       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
13009     if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
13010         SplatBitSize == 64) {
13011       EVT VbicVT;
13012       SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
13013                                       SplatUndef.getZExtValue(), SplatBitSize,
13014                                       DAG, dl, VbicVT, VT, OtherModImm);
13015       if (Val.getNode()) {
13016         SDValue Input =
13017           DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
13018         SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
13019         return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
13020       }
13021     }
13022   }
13023 
13024   if (!Subtarget->isThumb1Only()) {
13025     // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
13026     if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
13027       return Result;
13028 
13029     if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
13030       return Result;
13031   }
13032 
13033   if (Subtarget->isThumb1Only())
13034     if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
13035       return Result;
13036 
13037   return SDValue();
13038 }
13039 
13040 // Try combining OR nodes to SMULWB, SMULWT.
13041 static SDValue PerformORCombineToSMULWBT(SDNode *OR,
13042                                          TargetLowering::DAGCombinerInfo &DCI,
13043                                          const ARMSubtarget *Subtarget) {
13044   if (!Subtarget->hasV6Ops() ||
13045       (Subtarget->isThumb() &&
13046        (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
13047     return SDValue();
13048 
13049   SDValue SRL = OR->getOperand(0);
13050   SDValue SHL = OR->getOperand(1);
13051 
13052   if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
13053     SRL = OR->getOperand(1);
13054     SHL = OR->getOperand(0);
13055   }
13056   if (!isSRL16(SRL) || !isSHL16(SHL))
13057     return SDValue();
13058 
13059   // The first operands to the shifts need to be the two results from the
13060   // same smul_lohi node.
13061   if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
13062        SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
13063     return SDValue();
13064 
13065   SDNode *SMULLOHI = SRL.getOperand(0).getNode();
13066   if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
13067       SHL.getOperand(0) != SDValue(SMULLOHI, 1))
13068     return SDValue();
13069 
13070   // Now we have:
13071   // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
13072   // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
13073   // For SMUWB the 16-bit value will signed extended somehow.
13074   // For SMULWT only the SRA is required.
13075   // Check both sides of SMUL_LOHI
13076   SDValue OpS16 = SMULLOHI->getOperand(0);
13077   SDValue OpS32 = SMULLOHI->getOperand(1);
13078 
13079   SelectionDAG &DAG = DCI.DAG;
13080   if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
13081     OpS16 = OpS32;
13082     OpS32 = SMULLOHI->getOperand(0);
13083   }
13084 
13085   SDLoc dl(OR);
13086   unsigned Opcode = 0;
13087   if (isS16(OpS16, DAG))
13088     Opcode = ARMISD::SMULWB;
13089   else if (isSRA16(OpS16)) {
13090     Opcode = ARMISD::SMULWT;
13091     OpS16 = OpS16->getOperand(0);
13092   }
13093   else
13094     return SDValue();
13095 
13096   SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
13097   DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
13098   return SDValue(OR, 0);
13099 }
13100 
13101 static SDValue PerformORCombineToBFI(SDNode *N,
13102                                      TargetLowering::DAGCombinerInfo &DCI,
13103                                      const ARMSubtarget *Subtarget) {
13104   // BFI is only available on V6T2+
13105   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
13106     return SDValue();
13107 
13108   EVT VT = N->getValueType(0);
13109   SDValue N0 = N->getOperand(0);
13110   SDValue N1 = N->getOperand(1);
13111   SelectionDAG &DAG = DCI.DAG;
13112   SDLoc DL(N);
13113   // 1) or (and A, mask), val => ARMbfi A, val, mask
13114   //      iff (val & mask) == val
13115   //
13116   // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
13117   //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
13118   //          && mask == ~mask2
13119   //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
13120   //          && ~mask == mask2
13121   //  (i.e., copy a bitfield value into another bitfield of the same width)
13122 
13123   if (VT != MVT::i32)
13124     return SDValue();
13125 
13126   SDValue N00 = N0.getOperand(0);
13127 
13128   // The value and the mask need to be constants so we can verify this is
13129   // actually a bitfield set. If the mask is 0xffff, we can do better
13130   // via a movt instruction, so don't use BFI in that case.
13131   SDValue MaskOp = N0.getOperand(1);
13132   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
13133   if (!MaskC)
13134     return SDValue();
13135   unsigned Mask = MaskC->getZExtValue();
13136   if (Mask == 0xffff)
13137     return SDValue();
13138   SDValue Res;
13139   // Case (1): or (and A, mask), val => ARMbfi A, val, mask
13140   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
13141   if (N1C) {
13142     unsigned Val = N1C->getZExtValue();
13143     if ((Val & ~Mask) != Val)
13144       return SDValue();
13145 
13146     if (ARM::isBitFieldInvertedMask(Mask)) {
13147       Val >>= countTrailingZeros(~Mask);
13148 
13149       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
13150                         DAG.getConstant(Val, DL, MVT::i32),
13151                         DAG.getConstant(Mask, DL, MVT::i32));
13152 
13153       DCI.CombineTo(N, Res, false);
13154       // Return value from the original node to inform the combiner than N is
13155       // now dead.
13156       return SDValue(N, 0);
13157     }
13158   } else if (N1.getOpcode() == ISD::AND) {
13159     // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
13160     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
13161     if (!N11C)
13162       return SDValue();
13163     unsigned Mask2 = N11C->getZExtValue();
13164 
13165     // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
13166     // as is to match.
13167     if (ARM::isBitFieldInvertedMask(Mask) &&
13168         (Mask == ~Mask2)) {
13169       // The pack halfword instruction works better for masks that fit it,
13170       // so use that when it's available.
13171       if (Subtarget->hasDSP() &&
13172           (Mask == 0xffff || Mask == 0xffff0000))
13173         return SDValue();
13174       // 2a
13175       unsigned amt = countTrailingZeros(Mask2);
13176       Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
13177                         DAG.getConstant(amt, DL, MVT::i32));
13178       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
13179                         DAG.getConstant(Mask, DL, MVT::i32));
13180       DCI.CombineTo(N, Res, false);
13181       // Return value from the original node to inform the combiner than N is
13182       // now dead.
13183       return SDValue(N, 0);
13184     } else if (ARM::isBitFieldInvertedMask(~Mask) &&
13185                (~Mask == Mask2)) {
13186       // The pack halfword instruction works better for masks that fit it,
13187       // so use that when it's available.
13188       if (Subtarget->hasDSP() &&
13189           (Mask2 == 0xffff || Mask2 == 0xffff0000))
13190         return SDValue();
13191       // 2b
13192       unsigned lsb = countTrailingZeros(Mask);
13193       Res = DAG.getNode(ISD::SRL, DL, VT, N00,
13194                         DAG.getConstant(lsb, DL, MVT::i32));
13195       Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
13196                         DAG.getConstant(Mask2, DL, MVT::i32));
13197       DCI.CombineTo(N, Res, false);
13198       // Return value from the original node to inform the combiner than N is
13199       // now dead.
13200       return SDValue(N, 0);
13201     }
13202   }
13203 
13204   if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
13205       N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
13206       ARM::isBitFieldInvertedMask(~Mask)) {
13207     // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
13208     // where lsb(mask) == #shamt and masked bits of B are known zero.
13209     SDValue ShAmt = N00.getOperand(1);
13210     unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
13211     unsigned LSB = countTrailingZeros(Mask);
13212     if (ShAmtC != LSB)
13213       return SDValue();
13214 
13215     Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
13216                       DAG.getConstant(~Mask, DL, MVT::i32));
13217 
13218     DCI.CombineTo(N, Res, false);
13219     // Return value from the original node to inform the combiner than N is
13220     // now dead.
13221     return SDValue(N, 0);
13222   }
13223 
13224   return SDValue();
13225 }
13226 
13227 static bool isValidMVECond(unsigned CC, bool IsFloat) {
13228   switch (CC) {
13229   case ARMCC::EQ:
13230   case ARMCC::NE:
13231   case ARMCC::LE:
13232   case ARMCC::GT:
13233   case ARMCC::GE:
13234   case ARMCC::LT:
13235     return true;
13236   case ARMCC::HS:
13237   case ARMCC::HI:
13238     return !IsFloat;
13239   default:
13240     return false;
13241   };
13242 }
13243 
13244 static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
13245   if (N->getOpcode() == ARMISD::VCMP)
13246     return (ARMCC::CondCodes)N->getConstantOperandVal(2);
13247   else if (N->getOpcode() == ARMISD::VCMPZ)
13248     return (ARMCC::CondCodes)N->getConstantOperandVal(1);
13249   else
13250     llvm_unreachable("Not a VCMP/VCMPZ!");
13251 }
13252 
13253 static bool CanInvertMVEVCMP(SDValue N) {
13254   ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
13255   return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
13256 }
13257 
13258 static SDValue PerformORCombine_i1(SDNode *N,
13259                                    TargetLowering::DAGCombinerInfo &DCI,
13260                                    const ARMSubtarget *Subtarget) {
13261   // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
13262   // together with predicates
13263   EVT VT = N->getValueType(0);
13264   SDLoc DL(N);
13265   SDValue N0 = N->getOperand(0);
13266   SDValue N1 = N->getOperand(1);
13267 
13268   auto IsFreelyInvertable = [&](SDValue V) {
13269     if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
13270       return CanInvertMVEVCMP(V);
13271     return false;
13272   };
13273 
13274   // At least one operand must be freely invertable.
13275   if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
13276     return SDValue();
13277 
13278   SDValue NewN0 = DCI.DAG.getLogicalNOT(DL, N0, VT);
13279   SDValue NewN1 = DCI.DAG.getLogicalNOT(DL, N1, VT);
13280   SDValue And = DCI.DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
13281   return DCI.DAG.getLogicalNOT(DL, And, VT);
13282 }
13283 
13284 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR
13285 static SDValue PerformORCombine(SDNode *N,
13286                                 TargetLowering::DAGCombinerInfo &DCI,
13287                                 const ARMSubtarget *Subtarget) {
13288   // Attempt to use immediate-form VORR
13289   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
13290   SDLoc dl(N);
13291   EVT VT = N->getValueType(0);
13292   SelectionDAG &DAG = DCI.DAG;
13293 
13294   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
13295     return SDValue();
13296 
13297   if (Subtarget->hasMVEIntegerOps() &&
13298       (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
13299     return PerformORCombine_i1(N, DCI, Subtarget);
13300 
13301   APInt SplatBits, SplatUndef;
13302   unsigned SplatBitSize;
13303   bool HasAnyUndefs;
13304   if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
13305       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
13306     if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
13307         SplatBitSize == 64) {
13308       EVT VorrVT;
13309       SDValue Val =
13310           isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
13311                             SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
13312       if (Val.getNode()) {
13313         SDValue Input =
13314           DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
13315         SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
13316         return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
13317       }
13318     }
13319   }
13320 
13321   if (!Subtarget->isThumb1Only()) {
13322     // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
13323     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
13324       return Result;
13325     if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
13326       return Result;
13327   }
13328 
13329   SDValue N0 = N->getOperand(0);
13330   SDValue N1 = N->getOperand(1);
13331 
13332   // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
13333   if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
13334       DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
13335 
13336     // The code below optimizes (or (and X, Y), Z).
13337     // The AND operand needs to have a single user to make these optimizations
13338     // profitable.
13339     if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
13340       return SDValue();
13341 
13342     APInt SplatUndef;
13343     unsigned SplatBitSize;
13344     bool HasAnyUndefs;
13345 
13346     APInt SplatBits0, SplatBits1;
13347     BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
13348     BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
13349     // Ensure that the second operand of both ands are constants
13350     if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
13351                                       HasAnyUndefs) && !HasAnyUndefs) {
13352         if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
13353                                           HasAnyUndefs) && !HasAnyUndefs) {
13354             // Ensure that the bit width of the constants are the same and that
13355             // the splat arguments are logical inverses as per the pattern we
13356             // are trying to simplify.
13357             if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
13358                 SplatBits0 == ~SplatBits1) {
13359                 // Canonicalize the vector type to make instruction selection
13360                 // simpler.
13361                 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
13362                 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
13363                                              N0->getOperand(1),
13364                                              N0->getOperand(0),
13365                                              N1->getOperand(0));
13366                 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
13367             }
13368         }
13369     }
13370   }
13371 
13372   // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
13373   // reasonable.
13374   if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
13375     if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
13376       return Res;
13377   }
13378 
13379   if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
13380     return Result;
13381 
13382   return SDValue();
13383 }
13384 
13385 static SDValue PerformXORCombine(SDNode *N,
13386                                  TargetLowering::DAGCombinerInfo &DCI,
13387                                  const ARMSubtarget *Subtarget) {
13388   EVT VT = N->getValueType(0);
13389   SelectionDAG &DAG = DCI.DAG;
13390 
13391   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
13392     return SDValue();
13393 
13394   if (!Subtarget->isThumb1Only()) {
13395     // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
13396     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
13397       return Result;
13398 
13399     if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
13400       return Result;
13401   }
13402 
13403   if (Subtarget->hasMVEIntegerOps()) {
13404     // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
13405     SDValue N0 = N->getOperand(0);
13406     SDValue N1 = N->getOperand(1);
13407     const TargetLowering *TLI = Subtarget->getTargetLowering();
13408     if (TLI->isConstTrueVal(N1.getNode()) &&
13409         (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
13410       if (CanInvertMVEVCMP(N0)) {
13411         SDLoc DL(N0);
13412         ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));
13413 
13414         SmallVector<SDValue, 4> Ops;
13415         Ops.push_back(N0->getOperand(0));
13416         if (N0->getOpcode() == ARMISD::VCMP)
13417           Ops.push_back(N0->getOperand(1));
13418         Ops.push_back(DCI.DAG.getConstant(CC, DL, MVT::i32));
13419         return DCI.DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
13420       }
13421     }
13422   }
13423 
13424   return SDValue();
13425 }
13426 
13427 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
13428 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
13429 // their position in "to" (Rd).
13430 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
13431   assert(N->getOpcode() == ARMISD::BFI);
13432 
13433   SDValue From = N->getOperand(1);
13434   ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
13435   FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
13436 
13437   // If the Base came from a SHR #C, we can deduce that it is really testing bit
13438   // #C in the base of the SHR.
13439   if (From->getOpcode() == ISD::SRL &&
13440       isa<ConstantSDNode>(From->getOperand(1))) {
13441     APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
13442     assert(Shift.getLimitedValue() < 32 && "Shift too large!");
13443     FromMask <<= Shift.getLimitedValue(31);
13444     From = From->getOperand(0);
13445   }
13446 
13447   return From;
13448 }
13449 
13450 // If A and B contain one contiguous set of bits, does A | B == A . B?
13451 //
13452 // Neither A nor B must be zero.
13453 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
13454   unsigned LastActiveBitInA =  A.countTrailingZeros();
13455   unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
13456   return LastActiveBitInA - 1 == FirstActiveBitInB;
13457 }
13458 
13459 static SDValue FindBFIToCombineWith(SDNode *N) {
13460   // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with,
13461   // if one exists.
13462   APInt ToMask, FromMask;
13463   SDValue From = ParseBFI(N, ToMask, FromMask);
13464   SDValue To = N->getOperand(0);
13465 
13466   // Now check for a compatible BFI to merge with. We can pass through BFIs that
13467   // aren't compatible, but not if they set the same bit in their destination as
13468   // we do (or that of any BFI we're going to combine with).
13469   SDValue V = To;
13470   APInt CombinedToMask = ToMask;
13471   while (V.getOpcode() == ARMISD::BFI) {
13472     APInt NewToMask, NewFromMask;
13473     SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
13474     if (NewFrom != From) {
13475       // This BFI has a different base. Keep going.
13476       CombinedToMask |= NewToMask;
13477       V = V.getOperand(0);
13478       continue;
13479     }
13480 
13481     // Do the written bits conflict with any we've seen so far?
13482     if ((NewToMask & CombinedToMask).getBoolValue())
13483       // Conflicting bits - bail out because going further is unsafe.
13484       return SDValue();
13485 
13486     // Are the new bits contiguous when combined with the old bits?
13487     if (BitsProperlyConcatenate(ToMask, NewToMask) &&
13488         BitsProperlyConcatenate(FromMask, NewFromMask))
13489       return V;
13490     if (BitsProperlyConcatenate(NewToMask, ToMask) &&
13491         BitsProperlyConcatenate(NewFromMask, FromMask))
13492       return V;
13493 
13494     // We've seen a write to some bits, so track it.
13495     CombinedToMask |= NewToMask;
13496     // Keep going...
13497     V = V.getOperand(0);
13498   }
13499 
13500   return SDValue();
13501 }
13502 
13503 static SDValue PerformBFICombine(SDNode *N,
13504                                  TargetLowering::DAGCombinerInfo &DCI) {
13505   SDValue N1 = N->getOperand(1);
13506   if (N1.getOpcode() == ISD::AND) {
13507     // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
13508     // the bits being cleared by the AND are not demanded by the BFI.
13509     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
13510     if (!N11C)
13511       return SDValue();
13512     unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
13513     unsigned LSB = countTrailingZeros(~InvMask);
13514     unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
13515     assert(Width <
13516                static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
13517            "undefined behavior");
13518     unsigned Mask = (1u << Width) - 1;
13519     unsigned Mask2 = N11C->getZExtValue();
13520     if ((Mask & (~Mask2)) == 0)
13521       return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
13522                              N->getOperand(0), N1.getOperand(0),
13523                              N->getOperand(2));
13524   } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
13525     // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
13526     // Keep track of any consecutive bits set that all come from the same base
13527     // value. We can combine these together into a single BFI.
13528     SDValue CombineBFI = FindBFIToCombineWith(N);
13529     if (CombineBFI == SDValue())
13530       return SDValue();
13531 
13532     // We've found a BFI.
13533     APInt ToMask1, FromMask1;
13534     SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
13535 
13536     APInt ToMask2, FromMask2;
13537     SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
13538     assert(From1 == From2);
13539     (void)From2;
13540 
13541     // First, unlink CombineBFI.
13542     DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
13543     // Then create a new BFI, combining the two together.
13544     APInt NewFromMask = FromMask1 | FromMask2;
13545     APInt NewToMask = ToMask1 | ToMask2;
13546 
13547     EVT VT = N->getValueType(0);
13548     SDLoc dl(N);
13549 
13550     if (NewFromMask[0] == 0)
13551       From1 = DCI.DAG.getNode(
13552         ISD::SRL, dl, VT, From1,
13553         DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
13554     return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
13555                            DCI.DAG.getConstant(~NewToMask, dl, VT));
13556   }
13557   return SDValue();
13558 }
13559 
13560 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
13561 /// ARMISD::VMOVRRD.
13562 static SDValue PerformVMOVRRDCombine(SDNode *N,
13563                                      TargetLowering::DAGCombinerInfo &DCI,
13564                                      const ARMSubtarget *Subtarget) {
13565   // vmovrrd(vmovdrr x, y) -> x,y
13566   SDValue InDouble = N->getOperand(0);
13567   if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
13568     return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
13569 
13570   // vmovrrd(load f64) -> (load i32), (load i32)
13571   SDNode *InNode = InDouble.getNode();
13572   if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
13573       InNode->getValueType(0) == MVT::f64 &&
13574       InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
13575       !cast<LoadSDNode>(InNode)->isVolatile()) {
13576     // TODO: Should this be done for non-FrameIndex operands?
13577     LoadSDNode *LD = cast<LoadSDNode>(InNode);
13578 
13579     SelectionDAG &DAG = DCI.DAG;
13580     SDLoc DL(LD);
13581     SDValue BasePtr = LD->getBasePtr();
13582     SDValue NewLD1 =
13583         DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
13584                     LD->getAlignment(), LD->getMemOperand()->getFlags());
13585 
13586     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
13587                                     DAG.getConstant(4, DL, MVT::i32));
13588 
13589     SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
13590                                  LD->getPointerInfo().getWithOffset(4),
13591                                  std::min(4U, LD->getAlignment()),
13592                                  LD->getMemOperand()->getFlags());
13593 
13594     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
13595     if (DCI.DAG.getDataLayout().isBigEndian())
13596       std::swap (NewLD1, NewLD2);
13597     SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
13598     return Result;
13599   }
13600 
13601   return SDValue();
13602 }
13603 
13604 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for
13605 /// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
13606 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
13607   // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
13608   SDValue Op0 = N->getOperand(0);
13609   SDValue Op1 = N->getOperand(1);
13610   if (Op0.getOpcode() == ISD::BITCAST)
13611     Op0 = Op0.getOperand(0);
13612   if (Op1.getOpcode() == ISD::BITCAST)
13613     Op1 = Op1.getOperand(0);
13614   if (Op0.getOpcode() == ARMISD::VMOVRRD &&
13615       Op0.getNode() == Op1.getNode() &&
13616       Op0.getResNo() == 0 && Op1.getResNo() == 1)
13617     return DAG.getNode(ISD::BITCAST, SDLoc(N),
13618                        N->getValueType(0), Op0.getOperand(0));
13619   return SDValue();
13620 }
13621 
13622 static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13623   SDValue Op0 = N->getOperand(0);
13624 
13625   // VMOVhr (VMOVrh (X)) -> X
13626   if (Op0->getOpcode() == ARMISD::VMOVrh)
13627     return Op0->getOperand(0);
13628 
13629   // FullFP16: half values are passed in S-registers, and we don't
13630   // need any of the bitcast and moves:
13631   //
13632   //     t2: f32,ch = CopyFromReg t0, Register:f32 %0
13633   //   t5: i32 = bitcast t2
13634   // t18: f16 = ARMISD::VMOVhr t5
13635   if (Op0->getOpcode() == ISD::BITCAST) {
13636     SDValue Copy = Op0->getOperand(0);
13637     if (Copy.getValueType() == MVT::f32 &&
13638         Copy->getOpcode() == ISD::CopyFromReg) {
13639       SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)};
13640       SDValue NewCopy =
13641           DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), N->getValueType(0), Ops);
13642       return NewCopy;
13643     }
13644   }
13645 
13646   // fold (VMOVhr (load x)) -> (load (f16*)x)
13647   if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
13648     if (LN0->hasOneUse() && LN0->isUnindexed() &&
13649         LN0->getMemoryVT() == MVT::i16) {
13650       SDValue Load =
13651           DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
13652                           LN0->getBasePtr(), LN0->getMemOperand());
13653       DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
13654       DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
13655       return Load;
13656     }
13657   }
13658 
13659   // Only the bottom 16 bits of the source register are used.
13660   APInt DemandedMask = APInt::getLowBitsSet(32, 16);
13661   const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
13662   if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
13663     return SDValue(N, 0);
13664 
13665   return SDValue();
13666 }
13667 
13668 static SDValue PerformVMOVrhCombine(SDNode *N,
13669                                     TargetLowering::DAGCombinerInfo &DCI) {
13670   SDValue N0 = N->getOperand(0);
13671   EVT VT = N->getValueType(0);
13672 
13673   // fold (VMOVrh (fpconst x)) -> const x
13674   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {
13675     APFloat V = C->getValueAPF();
13676     return DCI.DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
13677   }
13678 
13679   // fold (VMOVrh (load x)) -> (zextload (i16*)x)
13680   if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
13681     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
13682 
13683     SDValue Load =
13684         DCI.DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
13685                            LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
13686     DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
13687     DCI.DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
13688     return Load;
13689   }
13690 
13691   // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
13692   if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
13693       isa<ConstantSDNode>(N0->getOperand(1)))
13694     return DCI.DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
13695                            N0->getOperand(1));
13696 
13697   return SDValue();
13698 }
13699 
13700 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
13701 /// are normal, non-volatile loads.  If so, it is profitable to bitcast an
13702 /// i64 vector to have f64 elements, since the value can then be loaded
13703 /// directly into a VFP register.
13704 static bool hasNormalLoadOperand(SDNode *N) {
13705   unsigned NumElts = N->getValueType(0).getVectorNumElements();
13706   for (unsigned i = 0; i < NumElts; ++i) {
13707     SDNode *Elt = N->getOperand(i).getNode();
13708     if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
13709       return true;
13710   }
13711   return false;
13712 }
13713 
13714 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
13715 /// ISD::BUILD_VECTOR.
13716 static SDValue PerformBUILD_VECTORCombine(SDNode *N,
13717                                           TargetLowering::DAGCombinerInfo &DCI,
13718                                           const ARMSubtarget *Subtarget) {
13719   // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
13720   // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
13721   // into a pair of GPRs, which is fine when the value is used as a scalar,
13722   // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
13723   SelectionDAG &DAG = DCI.DAG;
13724   if (N->getNumOperands() == 2)
13725     if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
13726       return RV;
13727 
13728   // Load i64 elements as f64 values so that type legalization does not split
13729   // them up into i32 values.
13730   EVT VT = N->getValueType(0);
13731   if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
13732     return SDValue();
13733   SDLoc dl(N);
13734   SmallVector<SDValue, 8> Ops;
13735   unsigned NumElts = VT.getVectorNumElements();
13736   for (unsigned i = 0; i < NumElts; ++i) {
13737     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
13738     Ops.push_back(V);
13739     // Make the DAGCombiner fold the bitcast.
13740     DCI.AddToWorklist(V.getNode());
13741   }
13742   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
13743   SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
13744   return DAG.getNode(ISD::BITCAST, dl, VT, BV);
13745 }
13746 
13747 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
13748 static SDValue
13749 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13750   // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
13751   // At that time, we may have inserted bitcasts from integer to float.
13752   // If these bitcasts have survived DAGCombine, change the lowering of this
13753   // BUILD_VECTOR in something more vector friendly, i.e., that does not
13754   // force to use floating point types.
13755 
13756   // Make sure we can change the type of the vector.
13757   // This is possible iff:
13758   // 1. The vector is only used in a bitcast to a integer type. I.e.,
13759   //    1.1. Vector is used only once.
13760   //    1.2. Use is a bit convert to an integer type.
13761   // 2. The size of its operands are 32-bits (64-bits are not legal).
13762   EVT VT = N->getValueType(0);
13763   EVT EltVT = VT.getVectorElementType();
13764 
13765   // Check 1.1. and 2.
13766   if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
13767     return SDValue();
13768 
13769   // By construction, the input type must be float.
13770   assert(EltVT == MVT::f32 && "Unexpected type!");
13771 
13772   // Check 1.2.
13773   SDNode *Use = *N->use_begin();
13774   if (Use->getOpcode() != ISD::BITCAST ||
13775       Use->getValueType(0).isFloatingPoint())
13776     return SDValue();
13777 
13778   // Check profitability.
13779   // Model is, if more than half of the relevant operands are bitcast from
13780   // i32, turn the build_vector into a sequence of insert_vector_elt.
13781   // Relevant operands are everything that is not statically
13782   // (i.e., at compile time) bitcasted.
13783   unsigned NumOfBitCastedElts = 0;
13784   unsigned NumElts = VT.getVectorNumElements();
13785   unsigned NumOfRelevantElts = NumElts;
13786   for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
13787     SDValue Elt = N->getOperand(Idx);
13788     if (Elt->getOpcode() == ISD::BITCAST) {
13789       // Assume only bit cast to i32 will go away.
13790       if (Elt->getOperand(0).getValueType() == MVT::i32)
13791         ++NumOfBitCastedElts;
13792     } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
13793       // Constants are statically casted, thus do not count them as
13794       // relevant operands.
13795       --NumOfRelevantElts;
13796   }
13797 
13798   // Check if more than half of the elements require a non-free bitcast.
13799   if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
13800     return SDValue();
13801 
13802   SelectionDAG &DAG = DCI.DAG;
13803   // Create the new vector type.
13804   EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
13805   // Check if the type is legal.
13806   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13807   if (!TLI.isTypeLegal(VecVT))
13808     return SDValue();
13809 
13810   // Combine:
13811   // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
13812   // => BITCAST INSERT_VECTOR_ELT
13813   //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
13814   //                      (BITCAST EN), N.
13815   SDValue Vec = DAG.getUNDEF(VecVT);
13816   SDLoc dl(N);
13817   for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
13818     SDValue V = N->getOperand(Idx);
13819     if (V.isUndef())
13820       continue;
13821     if (V.getOpcode() == ISD::BITCAST &&
13822         V->getOperand(0).getValueType() == MVT::i32)
13823       // Fold obvious case.
13824       V = V.getOperand(0);
13825     else {
13826       V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
13827       // Make the DAGCombiner fold the bitcasts.
13828       DCI.AddToWorklist(V.getNode());
13829     }
13830     SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
13831     Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
13832   }
13833   Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
13834   // Make the DAGCombiner fold the bitcasts.
13835   DCI.AddToWorklist(Vec.getNode());
13836   return Vec;
13837 }
13838 
13839 static SDValue
13840 PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13841   EVT VT = N->getValueType(0);
13842   SDValue Op = N->getOperand(0);
13843   SDLoc dl(N);
13844 
13845   // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
13846   if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
13847     // If the valuetypes are the same, we can remove the cast entirely.
13848     if (Op->getOperand(0).getValueType() == VT)
13849       return Op->getOperand(0);
13850     return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
13851   }
13852 
13853   // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
13854   // more VPNOT which might get folded as else predicates.
13855   if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
13856     SDValue X =
13857         DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
13858     SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
13859                                 DCI.DAG.getConstant(65535, dl, MVT::i32));
13860     return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
13861   }
13862 
13863   // Only the bottom 16 bits of the source register are used.
13864   if (Op.getValueType() == MVT::i32) {
13865     APInt DemandedMask = APInt::getLowBitsSet(32, 16);
13866     const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
13867     if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
13868       return SDValue(N, 0);
13869   }
13870   return SDValue();
13871 }
13872 
13873 static SDValue
13874 PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
13875                               const ARMSubtarget *ST) {
13876   EVT VT = N->getValueType(0);
13877   SDValue Op = N->getOperand(0);
13878   SDLoc dl(N);
13879 
13880   // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
13881   if (ST->isLittle())
13882     return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op);
13883 
13884   // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
13885   if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
13886     // If the valuetypes are the same, we can remove the cast entirely.
13887     if (Op->getOperand(0).getValueType() == VT)
13888       return Op->getOperand(0);
13889     return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
13890   }
13891 
13892   return SDValue();
13893 }
13894 
13895 static SDValue PerformVCMPCombine(SDNode *N,
13896                                   TargetLowering::DAGCombinerInfo &DCI,
13897                                   const ARMSubtarget *Subtarget) {
13898   if (!Subtarget->hasMVEIntegerOps())
13899     return SDValue();
13900 
13901   EVT VT = N->getValueType(0);
13902   SDValue Op0 = N->getOperand(0);
13903   SDValue Op1 = N->getOperand(1);
13904   ARMCC::CondCodes Cond =
13905       (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
13906   SDLoc dl(N);
13907 
13908   // vcmp X, 0, cc -> vcmpz X, cc
13909   if (isZeroVector(Op1))
13910     return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0,
13911                            N->getOperand(2));
13912 
13913   unsigned SwappedCond = getSwappedCondition(Cond);
13914   if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
13915     // vcmp 0, X, cc -> vcmpz X, reversed(cc)
13916     if (isZeroVector(Op0))
13917       return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
13918                              DCI.DAG.getConstant(SwappedCond, dl, MVT::i32));
13919     // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
13920     if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
13921       return DCI.DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
13922                              DCI.DAG.getConstant(SwappedCond, dl, MVT::i32));
13923   }
13924 
13925   return SDValue();
13926 }
13927 
13928 /// PerformInsertEltCombine - Target-specific dag combine xforms for
13929 /// ISD::INSERT_VECTOR_ELT.
13930 static SDValue PerformInsertEltCombine(SDNode *N,
13931                                        TargetLowering::DAGCombinerInfo &DCI) {
13932   // Bitcast an i64 load inserted into a vector to f64.
13933   // Otherwise, the i64 value will be legalized to a pair of i32 values.
13934   EVT VT = N->getValueType(0);
13935   SDNode *Elt = N->getOperand(1).getNode();
13936   if (VT.getVectorElementType() != MVT::i64 ||
13937       !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
13938     return SDValue();
13939 
13940   SelectionDAG &DAG = DCI.DAG;
13941   SDLoc dl(N);
13942   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
13943                                  VT.getVectorNumElements());
13944   SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
13945   SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
13946   // Make the DAGCombiner fold the bitcasts.
13947   DCI.AddToWorklist(Vec.getNode());
13948   DCI.AddToWorklist(V.getNode());
13949   SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
13950                                Vec, V, N->getOperand(2));
13951   return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
13952 }
13953 
13954 static SDValue PerformExtractEltCombine(SDNode *N,
13955                                         TargetLowering::DAGCombinerInfo &DCI) {
13956   SDValue Op0 = N->getOperand(0);
13957   EVT VT = N->getValueType(0);
13958   SDLoc dl(N);
13959 
13960   // extract (vdup x) -> x
13961   if (Op0->getOpcode() == ARMISD::VDUP) {
13962     SDValue X = Op0->getOperand(0);
13963     if (VT == MVT::f16 && X.getValueType() == MVT::i32)
13964       return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
13965     if (VT == MVT::i32 && X.getValueType() == MVT::f16)
13966       return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
13967 
13968     while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
13969       X = X->getOperand(0);
13970     if (X.getValueType() == VT)
13971       return X;
13972   }
13973 
13974   return SDValue();
13975 }
13976 
13977 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
13978 /// ISD::VECTOR_SHUFFLE.
13979 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
13980   // The LLVM shufflevector instruction does not require the shuffle mask
13981   // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
13982   // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
13983   // operands do not match the mask length, they are extended by concatenating
13984   // them with undef vectors.  That is probably the right thing for other
13985   // targets, but for NEON it is better to concatenate two double-register
13986   // size vector operands into a single quad-register size vector.  Do that
13987   // transformation here:
13988   //   shuffle(concat(v1, undef), concat(v2, undef)) ->
13989   //   shuffle(concat(v1, v2), undef)
13990   SDValue Op0 = N->getOperand(0);
13991   SDValue Op1 = N->getOperand(1);
13992   if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
13993       Op1.getOpcode() != ISD::CONCAT_VECTORS ||
13994       Op0.getNumOperands() != 2 ||
13995       Op1.getNumOperands() != 2)
13996     return SDValue();
13997   SDValue Concat0Op1 = Op0.getOperand(1);
13998   SDValue Concat1Op1 = Op1.getOperand(1);
13999   if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
14000     return SDValue();
14001   // Skip the transformation if any of the types are illegal.
14002   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14003   EVT VT = N->getValueType(0);
14004   if (!TLI.isTypeLegal(VT) ||
14005       !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
14006       !TLI.isTypeLegal(Concat1Op1.getValueType()))
14007     return SDValue();
14008 
14009   SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
14010                                   Op0.getOperand(0), Op1.getOperand(0));
14011   // Translate the shuffle mask.
14012   SmallVector<int, 16> NewMask;
14013   unsigned NumElts = VT.getVectorNumElements();
14014   unsigned HalfElts = NumElts/2;
14015   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
14016   for (unsigned n = 0; n < NumElts; ++n) {
14017     int MaskElt = SVN->getMaskElt(n);
14018     int NewElt = -1;
14019     if (MaskElt < (int)HalfElts)
14020       NewElt = MaskElt;
14021     else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
14022       NewElt = HalfElts + MaskElt - NumElts;
14023     NewMask.push_back(NewElt);
14024   }
14025   return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
14026                               DAG.getUNDEF(VT), NewMask);
14027 }
14028 
14029 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
14030 /// NEON load/store intrinsics, and generic vector load/stores, to merge
14031 /// base address updates.
14032 /// For generic load/stores, the memory type is assumed to be a vector.
14033 /// The caller is assumed to have checked legality.
14034 static SDValue CombineBaseUpdate(SDNode *N,
14035                                  TargetLowering::DAGCombinerInfo &DCI) {
14036   SelectionDAG &DAG = DCI.DAG;
14037   const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
14038                             N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
14039   const bool isStore = N->getOpcode() == ISD::STORE;
14040   const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
14041   SDValue Addr = N->getOperand(AddrOpIdx);
14042   MemSDNode *MemN = cast<MemSDNode>(N);
14043   SDLoc dl(N);
14044 
14045   // Search for a use of the address operand that is an increment.
14046   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
14047          UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
14048     SDNode *User = *UI;
14049     if (User->getOpcode() != ISD::ADD ||
14050         UI.getUse().getResNo() != Addr.getResNo())
14051       continue;
14052 
14053     // Check that the add is independent of the load/store.  Otherwise, folding
14054     // it would create a cycle. We can avoid searching through Addr as it's a
14055     // predecessor to both.
14056     SmallPtrSet<const SDNode *, 32> Visited;
14057     SmallVector<const SDNode *, 16> Worklist;
14058     Visited.insert(Addr.getNode());
14059     Worklist.push_back(N);
14060     Worklist.push_back(User);
14061     if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
14062         SDNode::hasPredecessorHelper(User, Visited, Worklist))
14063       continue;
14064 
14065     // Find the new opcode for the updating load/store.
14066     bool isLoadOp = true;
14067     bool isLaneOp = false;
14068     unsigned NewOpc = 0;
14069     unsigned NumVecs = 0;
14070     if (isIntrinsic) {
14071       unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
14072       switch (IntNo) {
14073       default: llvm_unreachable("unexpected intrinsic for Neon base update");
14074       case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
14075         NumVecs = 1; break;
14076       case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
14077         NumVecs = 2; break;
14078       case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
14079         NumVecs = 3; break;
14080       case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
14081         NumVecs = 4; break;
14082       case Intrinsic::arm_neon_vld1x2:
14083       case Intrinsic::arm_neon_vld1x3:
14084       case Intrinsic::arm_neon_vld1x4:
14085       case Intrinsic::arm_neon_vld2dup:
14086       case Intrinsic::arm_neon_vld3dup:
14087       case Intrinsic::arm_neon_vld4dup:
14088         // TODO: Support updating VLD1x and VLDxDUP nodes. For now, we just skip
14089         // combining base updates for such intrinsics.
14090         continue;
14091       case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
14092         NumVecs = 2; isLaneOp = true; break;
14093       case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
14094         NumVecs = 3; isLaneOp = true; break;
14095       case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
14096         NumVecs = 4; isLaneOp = true; break;
14097       case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
14098         NumVecs = 1; isLoadOp = false; break;
14099       case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
14100         NumVecs = 2; isLoadOp = false; break;
14101       case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
14102         NumVecs = 3; isLoadOp = false; break;
14103       case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
14104         NumVecs = 4; isLoadOp = false; break;
14105       case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
14106         NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
14107       case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
14108         NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
14109       case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
14110         NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
14111       }
14112     } else {
14113       isLaneOp = true;
14114       switch (N->getOpcode()) {
14115       default: llvm_unreachable("unexpected opcode for Neon base update");
14116       case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
14117       case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
14118       case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
14119       case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
14120       case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
14121         NumVecs = 1; isLaneOp = false; break;
14122       case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
14123         NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
14124       }
14125     }
14126 
14127     // Find the size of memory referenced by the load/store.
14128     EVT VecTy;
14129     if (isLoadOp) {
14130       VecTy = N->getValueType(0);
14131     } else if (isIntrinsic) {
14132       VecTy = N->getOperand(AddrOpIdx+1).getValueType();
14133     } else {
14134       assert(isStore && "Node has to be a load, a store, or an intrinsic!");
14135       VecTy = N->getOperand(1).getValueType();
14136     }
14137 
14138     unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
14139     if (isLaneOp)
14140       NumBytes /= VecTy.getVectorNumElements();
14141 
14142     // If the increment is a constant, it must match the memory ref size.
14143     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
14144     ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
14145     if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
14146       // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
14147       // separate instructions that make it harder to use a non-constant update.
14148       continue;
14149     }
14150 
14151     // OK, we found an ADD we can fold into the base update.
14152     // Now, create a _UPD node, taking care of not breaking alignment.
14153 
14154     EVT AlignedVecTy = VecTy;
14155     unsigned Alignment = MemN->getAlignment();
14156 
14157     // If this is a less-than-standard-aligned load/store, change the type to
14158     // match the standard alignment.
14159     // The alignment is overlooked when selecting _UPD variants; and it's
14160     // easier to introduce bitcasts here than fix that.
14161     // There are 3 ways to get to this base-update combine:
14162     // - intrinsics: they are assumed to be properly aligned (to the standard
14163     //   alignment of the memory type), so we don't need to do anything.
14164     // - ARMISD::VLDx nodes: they are only generated from the aforementioned
14165     //   intrinsics, so, likewise, there's nothing to do.
14166     // - generic load/store instructions: the alignment is specified as an
14167     //   explicit operand, rather than implicitly as the standard alignment
14168     //   of the memory type (like the intrisics).  We need to change the
14169     //   memory type to match the explicit alignment.  That way, we don't
14170     //   generate non-standard-aligned ARMISD::VLDx nodes.
14171     if (isa<LSBaseSDNode>(N)) {
14172       if (Alignment == 0)
14173         Alignment = 1;
14174       if (Alignment < VecTy.getScalarSizeInBits() / 8) {
14175         MVT EltTy = MVT::getIntegerVT(Alignment * 8);
14176         assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
14177         assert(!isLaneOp && "Unexpected generic load/store lane.");
14178         unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
14179         AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
14180       }
14181       // Don't set an explicit alignment on regular load/stores that we want
14182       // to transform to VLD/VST 1_UPD nodes.
14183       // This matches the behavior of regular load/stores, which only get an
14184       // explicit alignment if the MMO alignment is larger than the standard
14185       // alignment of the memory type.
14186       // Intrinsics, however, always get an explicit alignment, set to the
14187       // alignment of the MMO.
14188       Alignment = 1;
14189     }
14190 
14191     // Create the new updating load/store node.
14192     // First, create an SDVTList for the new updating node's results.
14193     EVT Tys[6];
14194     unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
14195     unsigned n;
14196     for (n = 0; n < NumResultVecs; ++n)
14197       Tys[n] = AlignedVecTy;
14198     Tys[n++] = MVT::i32;
14199     Tys[n] = MVT::Other;
14200     SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
14201 
14202     // Then, gather the new node's operands.
14203     SmallVector<SDValue, 8> Ops;
14204     Ops.push_back(N->getOperand(0)); // incoming chain
14205     Ops.push_back(N->getOperand(AddrOpIdx));
14206     Ops.push_back(Inc);
14207 
14208     if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
14209       // Try to match the intrinsic's signature
14210       Ops.push_back(StN->getValue());
14211     } else {
14212       // Loads (and of course intrinsics) match the intrinsics' signature,
14213       // so just add all but the alignment operand.
14214       for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
14215         Ops.push_back(N->getOperand(i));
14216     }
14217 
14218     // For all node types, the alignment operand is always the last one.
14219     Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
14220 
14221     // If this is a non-standard-aligned STORE, the penultimate operand is the
14222     // stored value.  Bitcast it to the aligned type.
14223     if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
14224       SDValue &StVal = Ops[Ops.size()-2];
14225       StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
14226     }
14227 
14228     EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
14229     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
14230                                            MemN->getMemOperand());
14231 
14232     // Update the uses.
14233     SmallVector<SDValue, 5> NewResults;
14234     for (unsigned i = 0; i < NumResultVecs; ++i)
14235       NewResults.push_back(SDValue(UpdN.getNode(), i));
14236 
14237     // If this is an non-standard-aligned LOAD, the first result is the loaded
14238     // value.  Bitcast it to the expected result type.
14239     if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
14240       SDValue &LdVal = NewResults[0];
14241       LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
14242     }
14243 
14244     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
14245     DCI.CombineTo(N, NewResults);
14246     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
14247 
14248     break;
14249   }
14250   return SDValue();
14251 }
14252 
14253 static SDValue PerformVLDCombine(SDNode *N,
14254                                  TargetLowering::DAGCombinerInfo &DCI) {
14255   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14256     return SDValue();
14257 
14258   return CombineBaseUpdate(N, DCI);
14259 }
14260 
14261 static SDValue PerformMVEVLDCombine(SDNode *N,
14262                                     TargetLowering::DAGCombinerInfo &DCI) {
14263   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14264     return SDValue();
14265 
14266   SelectionDAG &DAG = DCI.DAG;
14267   SDValue Addr = N->getOperand(2);
14268   MemSDNode *MemN = cast<MemSDNode>(N);
14269   SDLoc dl(N);
14270 
14271   // For the stores, where there are multiple intrinsics we only actually want
14272   // to post-inc the last of the them.
14273   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
14274   if (IntNo == Intrinsic::arm_mve_vst2q &&
14275       cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1)
14276     return SDValue();
14277   if (IntNo == Intrinsic::arm_mve_vst4q &&
14278       cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3)
14279     return SDValue();
14280 
14281   // Search for a use of the address operand that is an increment.
14282   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
14283                             UE = Addr.getNode()->use_end();
14284        UI != UE; ++UI) {
14285     SDNode *User = *UI;
14286     if (User->getOpcode() != ISD::ADD ||
14287         UI.getUse().getResNo() != Addr.getResNo())
14288       continue;
14289 
14290     // Check that the add is independent of the load/store.  Otherwise, folding
14291     // it would create a cycle. We can avoid searching through Addr as it's a
14292     // predecessor to both.
14293     SmallPtrSet<const SDNode *, 32> Visited;
14294     SmallVector<const SDNode *, 16> Worklist;
14295     Visited.insert(Addr.getNode());
14296     Worklist.push_back(N);
14297     Worklist.push_back(User);
14298     if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
14299         SDNode::hasPredecessorHelper(User, Visited, Worklist))
14300       continue;
14301 
14302     // Find the new opcode for the updating load/store.
14303     bool isLoadOp = true;
14304     unsigned NewOpc = 0;
14305     unsigned NumVecs = 0;
14306     switch (IntNo) {
14307     default:
14308       llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
14309     case Intrinsic::arm_mve_vld2q:
14310       NewOpc = ARMISD::VLD2_UPD;
14311       NumVecs = 2;
14312       break;
14313     case Intrinsic::arm_mve_vld4q:
14314       NewOpc = ARMISD::VLD4_UPD;
14315       NumVecs = 4;
14316       break;
14317     case Intrinsic::arm_mve_vst2q:
14318       NewOpc = ARMISD::VST2_UPD;
14319       NumVecs = 2;
14320       isLoadOp = false;
14321       break;
14322     case Intrinsic::arm_mve_vst4q:
14323       NewOpc = ARMISD::VST4_UPD;
14324       NumVecs = 4;
14325       isLoadOp = false;
14326       break;
14327     }
14328 
14329     // Find the size of memory referenced by the load/store.
14330     EVT VecTy;
14331     if (isLoadOp) {
14332       VecTy = N->getValueType(0);
14333     } else {
14334       VecTy = N->getOperand(3).getValueType();
14335     }
14336 
14337     unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
14338 
14339     // If the increment is a constant, it must match the memory ref size.
14340     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
14341     ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
14342     if (!CInc || CInc->getZExtValue() != NumBytes)
14343       continue;
14344 
14345     // Create the new updating load/store node.
14346     // First, create an SDVTList for the new updating node's results.
14347     EVT Tys[6];
14348     unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
14349     unsigned n;
14350     for (n = 0; n < NumResultVecs; ++n)
14351       Tys[n] = VecTy;
14352     Tys[n++] = MVT::i32;
14353     Tys[n] = MVT::Other;
14354     SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
14355 
14356     // Then, gather the new node's operands.
14357     SmallVector<SDValue, 8> Ops;
14358     Ops.push_back(N->getOperand(0)); // incoming chain
14359     Ops.push_back(N->getOperand(2)); // ptr
14360     Ops.push_back(Inc);
14361 
14362     for (unsigned i = 3; i < N->getNumOperands(); ++i)
14363       Ops.push_back(N->getOperand(i));
14364 
14365     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
14366                                            MemN->getMemOperand());
14367 
14368     // Update the uses.
14369     SmallVector<SDValue, 5> NewResults;
14370     for (unsigned i = 0; i < NumResultVecs; ++i)
14371       NewResults.push_back(SDValue(UpdN.getNode(), i));
14372 
14373     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
14374     DCI.CombineTo(N, NewResults);
14375     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
14376 
14377     break;
14378   }
14379 
14380   return SDValue();
14381 }
14382 
14383 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
14384 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
14385 /// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
14386 /// return true.
14387 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
14388   SelectionDAG &DAG = DCI.DAG;
14389   EVT VT = N->getValueType(0);
14390   // vldN-dup instructions only support 64-bit vectors for N > 1.
14391   if (!VT.is64BitVector())
14392     return false;
14393 
14394   // Check if the VDUPLANE operand is a vldN-dup intrinsic.
14395   SDNode *VLD = N->getOperand(0).getNode();
14396   if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
14397     return false;
14398   unsigned NumVecs = 0;
14399   unsigned NewOpc = 0;
14400   unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
14401   if (IntNo == Intrinsic::arm_neon_vld2lane) {
14402     NumVecs = 2;
14403     NewOpc = ARMISD::VLD2DUP;
14404   } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
14405     NumVecs = 3;
14406     NewOpc = ARMISD::VLD3DUP;
14407   } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
14408     NumVecs = 4;
14409     NewOpc = ARMISD::VLD4DUP;
14410   } else {
14411     return false;
14412   }
14413 
14414   // First check that all the vldN-lane uses are VDUPLANEs and that the lane
14415   // numbers match the load.
14416   unsigned VLDLaneNo =
14417     cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
14418   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
14419        UI != UE; ++UI) {
14420     // Ignore uses of the chain result.
14421     if (UI.getUse().getResNo() == NumVecs)
14422       continue;
14423     SDNode *User = *UI;
14424     if (User->getOpcode() != ARMISD::VDUPLANE ||
14425         VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
14426       return false;
14427   }
14428 
14429   // Create the vldN-dup node.
14430   EVT Tys[5];
14431   unsigned n;
14432   for (n = 0; n < NumVecs; ++n)
14433     Tys[n] = VT;
14434   Tys[n] = MVT::Other;
14435   SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
14436   SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
14437   MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
14438   SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
14439                                            Ops, VLDMemInt->getMemoryVT(),
14440                                            VLDMemInt->getMemOperand());
14441 
14442   // Update the uses.
14443   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
14444        UI != UE; ++UI) {
14445     unsigned ResNo = UI.getUse().getResNo();
14446     // Ignore uses of the chain result.
14447     if (ResNo == NumVecs)
14448       continue;
14449     SDNode *User = *UI;
14450     DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
14451   }
14452 
14453   // Now the vldN-lane intrinsic is dead except for its chain result.
14454   // Update uses of the chain.
14455   std::vector<SDValue> VLDDupResults;
14456   for (unsigned n = 0; n < NumVecs; ++n)
14457     VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
14458   VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
14459   DCI.CombineTo(VLD, VLDDupResults);
14460 
14461   return true;
14462 }
14463 
14464 /// PerformVDUPLANECombine - Target-specific dag combine xforms for
14465 /// ARMISD::VDUPLANE.
14466 static SDValue PerformVDUPLANECombine(SDNode *N,
14467                                       TargetLowering::DAGCombinerInfo &DCI,
14468                                       const ARMSubtarget *Subtarget) {
14469   SDValue Op = N->getOperand(0);
14470   EVT VT = N->getValueType(0);
14471 
14472   // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
14473   if (Subtarget->hasMVEIntegerOps()) {
14474     EVT ExtractVT = VT.getVectorElementType();
14475     // We need to ensure we are creating a legal type.
14476     if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
14477       ExtractVT = MVT::i32;
14478     SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
14479                               N->getOperand(0), N->getOperand(1));
14480     return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
14481   }
14482 
14483   // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
14484   // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
14485   if (CombineVLDDUP(N, DCI))
14486     return SDValue(N, 0);
14487 
14488   // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
14489   // redundant.  Ignore bit_converts for now; element sizes are checked below.
14490   while (Op.getOpcode() == ISD::BITCAST)
14491     Op = Op.getOperand(0);
14492   if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
14493     return SDValue();
14494 
14495   // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
14496   unsigned EltSize = Op.getScalarValueSizeInBits();
14497   // The canonical VMOV for a zero vector uses a 32-bit element size.
14498   unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
14499   unsigned EltBits;
14500   if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
14501     EltSize = 8;
14502   if (EltSize > VT.getScalarSizeInBits())
14503     return SDValue();
14504 
14505   return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
14506 }
14507 
14508 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
14509 static SDValue PerformVDUPCombine(SDNode *N,
14510                                   TargetLowering::DAGCombinerInfo &DCI,
14511                                   const ARMSubtarget *Subtarget) {
14512   SelectionDAG &DAG = DCI.DAG;
14513   SDValue Op = N->getOperand(0);
14514   SDLoc dl(N);
14515 
14516   if (Subtarget->hasMVEIntegerOps()) {
14517     // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
14518     // need to come from a GPR.
14519     if (Op.getValueType() == MVT::f32)
14520       return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
14521                              DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
14522     else if (Op.getValueType() == MVT::f16)
14523       return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
14524                              DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
14525   }
14526 
14527   if (!Subtarget->hasNEON())
14528     return SDValue();
14529 
14530   // Match VDUP(LOAD) -> VLD1DUP.
14531   // We match this pattern here rather than waiting for isel because the
14532   // transform is only legal for unindexed loads.
14533   LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
14534   if (LD && Op.hasOneUse() && LD->isUnindexed() &&
14535       LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
14536     SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
14537                       DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) };
14538     SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
14539     SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys,
14540                                              Ops, LD->getMemoryVT(),
14541                                              LD->getMemOperand());
14542     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
14543     return VLDDup;
14544   }
14545 
14546   return SDValue();
14547 }
14548 
14549 static SDValue PerformLOADCombine(SDNode *N,
14550                                   TargetLowering::DAGCombinerInfo &DCI) {
14551   EVT VT = N->getValueType(0);
14552 
14553   // If this is a legal vector load, try to combine it into a VLD1_UPD.
14554   if (ISD::isNormalLoad(N) && VT.isVector() &&
14555       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
14556     return CombineBaseUpdate(N, DCI);
14557 
14558   return SDValue();
14559 }
14560 
14561 // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
14562 // pack all of the elements in one place.  Next, store to memory in fewer
14563 // chunks.
14564 static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
14565                                              SelectionDAG &DAG) {
14566   SDValue StVal = St->getValue();
14567   EVT VT = StVal.getValueType();
14568   if (!St->isTruncatingStore() || !VT.isVector())
14569     return SDValue();
14570   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14571   EVT StVT = St->getMemoryVT();
14572   unsigned NumElems = VT.getVectorNumElements();
14573   assert(StVT != VT && "Cannot truncate to the same type");
14574   unsigned FromEltSz = VT.getScalarSizeInBits();
14575   unsigned ToEltSz = StVT.getScalarSizeInBits();
14576 
14577   // From, To sizes and ElemCount must be pow of two
14578   if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
14579     return SDValue();
14580 
14581   // We are going to use the original vector elt for storing.
14582   // Accumulated smaller vector elements must be a multiple of the store size.
14583   if (0 != (NumElems * FromEltSz) % ToEltSz)
14584     return SDValue();
14585 
14586   unsigned SizeRatio = FromEltSz / ToEltSz;
14587   assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
14588 
14589   // Create a type on which we perform the shuffle.
14590   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
14591                                    NumElems * SizeRatio);
14592   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
14593 
14594   SDLoc DL(St);
14595   SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
14596   SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
14597   for (unsigned i = 0; i < NumElems; ++i)
14598     ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
14599                                                       : i * SizeRatio;
14600 
14601   // Can't shuffle using an illegal type.
14602   if (!TLI.isTypeLegal(WideVecVT))
14603     return SDValue();
14604 
14605   SDValue Shuff = DAG.getVectorShuffle(
14606       WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
14607   // At this point all of the data is stored at the bottom of the
14608   // register. We now need to save it to mem.
14609 
14610   // Find the largest store unit
14611   MVT StoreType = MVT::i8;
14612   for (MVT Tp : MVT::integer_valuetypes()) {
14613     if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
14614       StoreType = Tp;
14615   }
14616   // Didn't find a legal store type.
14617   if (!TLI.isTypeLegal(StoreType))
14618     return SDValue();
14619 
14620   // Bitcast the original vector into a vector of store-size units
14621   EVT StoreVecVT =
14622       EVT::getVectorVT(*DAG.getContext(), StoreType,
14623                        VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
14624   assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
14625   SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
14626   SmallVector<SDValue, 8> Chains;
14627   SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
14628                                       TLI.getPointerTy(DAG.getDataLayout()));
14629   SDValue BasePtr = St->getBasePtr();
14630 
14631   // Perform one or more big stores into memory.
14632   unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
14633   for (unsigned I = 0; I < E; I++) {
14634     SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
14635                                  ShuffWide, DAG.getIntPtrConstant(I, DL));
14636     SDValue Ch =
14637         DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
14638                      St->getAlignment(), St->getMemOperand()->getFlags());
14639     BasePtr =
14640         DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
14641     Chains.push_back(Ch);
14642   }
14643   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
14644 }
14645 
14646 // Try taking a single vector store from an truncate (which would otherwise turn
14647 // into an expensive buildvector) and splitting it into a series of narrowing
14648 // stores.
14649 static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
14650                                                  SelectionDAG &DAG) {
14651   if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
14652     return SDValue();
14653   SDValue Trunc = St->getValue();
14654   if (Trunc->getOpcode() != ISD::TRUNCATE && Trunc->getOpcode() != ISD::FP_ROUND)
14655     return SDValue();
14656   EVT FromVT = Trunc->getOperand(0).getValueType();
14657   EVT ToVT = Trunc.getValueType();
14658   if (!ToVT.isVector())
14659     return SDValue();
14660   assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
14661   EVT ToEltVT = ToVT.getVectorElementType();
14662   EVT FromEltVT = FromVT.getVectorElementType();
14663 
14664   unsigned NumElements = 0;
14665   if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8))
14666     NumElements = 4;
14667   if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
14668     NumElements = 8;
14669   if (FromEltVT == MVT::f32 && ToEltVT == MVT::f16)
14670     NumElements = 4;
14671   if (NumElements == 0 ||
14672       (FromEltVT != MVT::f32 && FromVT.getVectorNumElements() == NumElements) ||
14673       FromVT.getVectorNumElements() % NumElements != 0)
14674     return SDValue();
14675 
14676   // Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so
14677   // use the VMOVN over splitting the store. We are looking for patterns of:
14678   // !rev: 0 N 1 N+1 2 N+2 ...
14679   //  rev: N 0 N+1 1 N+2 2 ...
14680   // The shuffle may either be a single source (in which case N = NumElts/2) or
14681   // two inputs extended with concat to the same size (in which case N =
14682   // NumElts).
14683   auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
14684     ArrayRef<int> M = SVN->getMask();
14685     unsigned NumElts = ToVT.getVectorNumElements();
14686     if (SVN->getOperand(1).isUndef())
14687       NumElts /= 2;
14688 
14689     unsigned Off0 = Rev ? NumElts : 0;
14690     unsigned Off1 = Rev ? 0 : NumElts;
14691 
14692     for (unsigned I = 0; I < NumElts; I += 2) {
14693       if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
14694         return false;
14695       if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
14696         return false;
14697     }
14698 
14699     return true;
14700   };
14701 
14702   // It may be preferable to keep the store unsplit as the trunc may end up
14703   // being removed. Check that here.
14704   if (Trunc.getOperand(0).getOpcode() == ISD::SMIN) {
14705     if (SDValue U = PerformVQDMULHCombine(Trunc.getOperand(0).getNode(), DAG)) {
14706       DAG.ReplaceAllUsesWith(Trunc.getOperand(0), U);
14707       return SDValue();
14708     }
14709   }
14710   if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
14711     if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
14712       return SDValue();
14713 
14714   LLVMContext &C = *DAG.getContext();
14715   SDLoc DL(St);
14716   // Details about the old store
14717   SDValue Ch = St->getChain();
14718   SDValue BasePtr = St->getBasePtr();
14719   Align Alignment = St->getOriginalAlign();
14720   MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
14721   AAMDNodes AAInfo = St->getAAInfo();
14722 
14723   // We split the store into slices of NumElements. fp16 trunc stores are vcvt
14724   // and then stored as truncating integer stores.
14725   EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
14726   EVT NewToVT = EVT::getVectorVT(
14727       C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
14728 
14729   SmallVector<SDValue, 4> Stores;
14730   for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
14731     unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
14732     SDValue NewPtr =
14733         DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
14734 
14735     SDValue Extract =
14736         DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
14737                     DAG.getConstant(i * NumElements, DL, MVT::i32));
14738 
14739     if (ToEltVT == MVT::f16) {
14740       SDValue FPTrunc =
14741           DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
14742                       Extract, DAG.getConstant(0, DL, MVT::i32));
14743       Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
14744     }
14745 
14746     SDValue Store = DAG.getTruncStore(
14747         Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
14748         NewToVT, Alignment.value(), MMOFlags, AAInfo);
14749     Stores.push_back(Store);
14750   }
14751   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
14752 }
14753 
14754 /// PerformSTORECombine - Target-specific dag combine xforms for
14755 /// ISD::STORE.
14756 static SDValue PerformSTORECombine(SDNode *N,
14757                                    TargetLowering::DAGCombinerInfo &DCI,
14758                                    const ARMSubtarget *Subtarget) {
14759   StoreSDNode *St = cast<StoreSDNode>(N);
14760   if (St->isVolatile())
14761     return SDValue();
14762   SDValue StVal = St->getValue();
14763   EVT VT = StVal.getValueType();
14764 
14765   if (Subtarget->hasNEON())
14766     if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
14767       return Store;
14768 
14769   if (Subtarget->hasMVEIntegerOps())
14770     if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
14771       return NewToken;
14772 
14773   if (!ISD::isNormalStore(St))
14774     return SDValue();
14775 
14776   // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
14777   // ARM stores of arguments in the same cache line.
14778   if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
14779       StVal.getNode()->hasOneUse()) {
14780     SelectionDAG  &DAG = DCI.DAG;
14781     bool isBigEndian = DAG.getDataLayout().isBigEndian();
14782     SDLoc DL(St);
14783     SDValue BasePtr = St->getBasePtr();
14784     SDValue NewST1 = DAG.getStore(
14785         St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
14786         BasePtr, St->getPointerInfo(), St->getOriginalAlign(),
14787         St->getMemOperand()->getFlags());
14788 
14789     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
14790                                     DAG.getConstant(4, DL, MVT::i32));
14791     return DAG.getStore(NewST1.getValue(0), DL,
14792                         StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
14793                         OffsetPtr, St->getPointerInfo().getWithOffset(4),
14794                         St->getOriginalAlign(),
14795                         St->getMemOperand()->getFlags());
14796   }
14797 
14798   if (StVal.getValueType() == MVT::i64 &&
14799       StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14800 
14801     // Bitcast an i64 store extracted from a vector to f64.
14802     // Otherwise, the i64 value will be legalized to a pair of i32 values.
14803     SelectionDAG &DAG = DCI.DAG;
14804     SDLoc dl(StVal);
14805     SDValue IntVec = StVal.getOperand(0);
14806     EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
14807                                    IntVec.getValueType().getVectorNumElements());
14808     SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
14809     SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14810                                  Vec, StVal.getOperand(1));
14811     dl = SDLoc(N);
14812     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
14813     // Make the DAGCombiner fold the bitcasts.
14814     DCI.AddToWorklist(Vec.getNode());
14815     DCI.AddToWorklist(ExtElt.getNode());
14816     DCI.AddToWorklist(V.getNode());
14817     return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
14818                         St->getPointerInfo(), St->getAlignment(),
14819                         St->getMemOperand()->getFlags(), St->getAAInfo());
14820   }
14821 
14822   // If this is a legal vector store, try to combine it into a VST1_UPD.
14823   if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
14824       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
14825     return CombineBaseUpdate(N, DCI);
14826 
14827   return SDValue();
14828 }
14829 
14830 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
14831 /// can replace combinations of VMUL and VCVT (floating-point to integer)
14832 /// when the VMUL has a constant operand that is a power of 2.
14833 ///
14834 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
14835 ///  vmul.f32        d16, d17, d16
14836 ///  vcvt.s32.f32    d16, d16
14837 /// becomes:
14838 ///  vcvt.s32.f32    d16, d16, #3
14839 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
14840                                   const ARMSubtarget *Subtarget) {
14841   if (!Subtarget->hasNEON())
14842     return SDValue();
14843 
14844   SDValue Op = N->getOperand(0);
14845   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
14846       Op.getOpcode() != ISD::FMUL)
14847     return SDValue();
14848 
14849   SDValue ConstVec = Op->getOperand(1);
14850   if (!isa<BuildVectorSDNode>(ConstVec))
14851     return SDValue();
14852 
14853   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
14854   uint32_t FloatBits = FloatTy.getSizeInBits();
14855   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
14856   uint32_t IntBits = IntTy.getSizeInBits();
14857   unsigned NumLanes = Op.getValueType().getVectorNumElements();
14858   if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
14859     // These instructions only exist converting from f32 to i32. We can handle
14860     // smaller integers by generating an extra truncate, but larger ones would
14861     // be lossy. We also can't handle anything other than 2 or 4 lanes, since
14862     // these intructions only support v2i32/v4i32 types.
14863     return SDValue();
14864   }
14865 
14866   BitVector UndefElements;
14867   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
14868   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
14869   if (C == -1 || C == 0 || C > 32)
14870     return SDValue();
14871 
14872   SDLoc dl(N);
14873   bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
14874   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
14875     Intrinsic::arm_neon_vcvtfp2fxu;
14876   SDValue FixConv = DAG.getNode(
14877       ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
14878       DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
14879       DAG.getConstant(C, dl, MVT::i32));
14880 
14881   if (IntBits < FloatBits)
14882     FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
14883 
14884   return FixConv;
14885 }
14886 
14887 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
14888 /// can replace combinations of VCVT (integer to floating-point) and VDIV
14889 /// when the VDIV has a constant operand that is a power of 2.
14890 ///
14891 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
14892 ///  vcvt.f32.s32    d16, d16
14893 ///  vdiv.f32        d16, d17, d16
14894 /// becomes:
14895 ///  vcvt.f32.s32    d16, d16, #3
14896 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
14897                                   const ARMSubtarget *Subtarget) {
14898   if (!Subtarget->hasNEON())
14899     return SDValue();
14900 
14901   SDValue Op = N->getOperand(0);
14902   unsigned OpOpcode = Op.getNode()->getOpcode();
14903   if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
14904       (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
14905     return SDValue();
14906 
14907   SDValue ConstVec = N->getOperand(1);
14908   if (!isa<BuildVectorSDNode>(ConstVec))
14909     return SDValue();
14910 
14911   MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
14912   uint32_t FloatBits = FloatTy.getSizeInBits();
14913   MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
14914   uint32_t IntBits = IntTy.getSizeInBits();
14915   unsigned NumLanes = Op.getValueType().getVectorNumElements();
14916   if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
14917     // These instructions only exist converting from i32 to f32. We can handle
14918     // smaller integers by generating an extra extend, but larger ones would
14919     // be lossy. We also can't handle anything other than 2 or 4 lanes, since
14920     // these intructions only support v2i32/v4i32 types.
14921     return SDValue();
14922   }
14923 
14924   BitVector UndefElements;
14925   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
14926   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
14927   if (C == -1 || C == 0 || C > 32)
14928     return SDValue();
14929 
14930   SDLoc dl(N);
14931   bool isSigned = OpOpcode == ISD::SINT_TO_FP;
14932   SDValue ConvInput = Op.getOperand(0);
14933   if (IntBits < FloatBits)
14934     ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
14935                             dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
14936                             ConvInput);
14937 
14938   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
14939     Intrinsic::arm_neon_vcvtfxu2fp;
14940   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
14941                      Op.getValueType(),
14942                      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
14943                      ConvInput, DAG.getConstant(C, dl, MVT::i32));
14944 }
14945 
14946 static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
14947                                            const ARMSubtarget *ST) {
14948   if (!ST->hasMVEIntegerOps())
14949     return SDValue();
14950 
14951   assert(N->getOpcode() == ISD::VECREDUCE_ADD);
14952   EVT ResVT = N->getValueType(0);
14953   SDValue N0 = N->getOperand(0);
14954   SDLoc dl(N);
14955 
14956   // We are looking for something that will have illegal types if left alone,
14957   // but that we can convert to a single instruction undef MVE. For example
14958   // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
14959   // or
14960   // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
14961 
14962   // Cases:
14963   //   VADDV u/s 8/16/32
14964   //   VMLAV u/s 8/16/32
14965   //   VADDLV u/s 32
14966   //   VMLALV u/s 16/32
14967 
14968   // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
14969   // extend it and use v4i32 instead.
14970   auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
14971     EVT AVT = A.getValueType();
14972     if (!AVT.is128BitVector())
14973       A = DAG.getNode(ExtendCode, dl,
14974                       AVT.changeVectorElementType(MVT::getIntegerVT(
14975                           128 / AVT.getVectorMinNumElements())),
14976                       A);
14977     return A;
14978   };
14979   auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
14980     if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
14981       return SDValue();
14982     SDValue A = N0->getOperand(0);
14983     if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; }))
14984       return ExtendIfNeeded(A, ExtendCode);
14985     return SDValue();
14986   };
14987   auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
14988                          ArrayRef<MVT> ExtTypes, SDValue &Mask) {
14989     if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
14990         !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
14991       return SDValue();
14992     Mask = N0->getOperand(0);
14993     SDValue Ext = N0->getOperand(1);
14994     if (Ext->getOpcode() != ExtendCode)
14995       return SDValue();
14996     SDValue A = Ext->getOperand(0);
14997     if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; }))
14998       return ExtendIfNeeded(A, ExtendCode);
14999     return SDValue();
15000   };
15001   auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
15002                      SDValue &A, SDValue &B) {
15003     // For a vmla we are trying to match a larger pattern:
15004     // ExtA = sext/zext A
15005     // ExtB = sext/zext B
15006     // Mul = mul ExtA, ExtB
15007     // vecreduce.add Mul
15008     // There might also be en extra extend between the mul and the addreduce, so
15009     // long as the bitwidth is high enough to make them equivalent (for example
15010     // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
15011     if (ResVT != RetTy)
15012       return false;
15013     SDValue Mul = N0;
15014     if (Mul->getOpcode() == ExtendCode &&
15015         Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
15016             ResVT.getScalarSizeInBits())
15017       Mul = Mul->getOperand(0);
15018     if (Mul->getOpcode() != ISD::MUL)
15019       return false;
15020     SDValue ExtA = Mul->getOperand(0);
15021     SDValue ExtB = Mul->getOperand(1);
15022     if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode)
15023       return false;
15024     A = ExtA->getOperand(0);
15025     B = ExtB->getOperand(0);
15026     if (A.getValueType() == B.getValueType() &&
15027         llvm::any_of(ExtTypes,
15028                      [&A](MVT Ty) { return A.getValueType() == Ty; })) {
15029       A = ExtendIfNeeded(A, ExtendCode);
15030       B = ExtendIfNeeded(B, ExtendCode);
15031       return true;
15032     }
15033     return false;
15034   };
15035   auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
15036                      SDValue &A, SDValue &B, SDValue &Mask) {
15037     // Same as the pattern above with a select for the zero predicated lanes
15038     // ExtA = sext/zext A
15039     // ExtB = sext/zext B
15040     // Mul = mul ExtA, ExtB
15041     // N0 = select Mask, Mul, 0
15042     // vecreduce.add N0
15043     if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
15044         !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
15045       return false;
15046     Mask = N0->getOperand(0);
15047     SDValue Mul = N0->getOperand(1);
15048     if (Mul->getOpcode() == ExtendCode &&
15049         Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
15050             ResVT.getScalarSizeInBits())
15051       Mul = Mul->getOperand(0);
15052     if (Mul->getOpcode() != ISD::MUL)
15053       return false;
15054     SDValue ExtA = Mul->getOperand(0);
15055     SDValue ExtB = Mul->getOperand(1);
15056     if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode)
15057       return false;
15058     A = ExtA->getOperand(0);
15059     B = ExtB->getOperand(0);
15060     if (A.getValueType() == B.getValueType() &&
15061         llvm::any_of(ExtTypes,
15062                      [&A](MVT Ty) { return A.getValueType() == Ty; })) {
15063       A = ExtendIfNeeded(A, ExtendCode);
15064       B = ExtendIfNeeded(B, ExtendCode);
15065       return true;
15066     }
15067     return false;
15068   };
15069   auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
15070     SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
15071     return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
15072                        SDValue(Node.getNode(), 1));
15073   };
15074 
15075   if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
15076     return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
15077   if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
15078     return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
15079   if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND,
15080                           {MVT::v4i8, MVT::v4i16, MVT::v4i32}))
15081     return Create64bitNode(ARMISD::VADDLVs, {A});
15082   if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND,
15083                           {MVT::v4i8, MVT::v4i16, MVT::v4i32}))
15084     return Create64bitNode(ARMISD::VADDLVu, {A});
15085   if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
15086     return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
15087                        DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
15088   if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
15089     return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
15090                        DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
15091 
15092   SDValue Mask;
15093   if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
15094     return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
15095   if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
15096     return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
15097   if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND,
15098                               {MVT::v4i8, MVT::v4i16, MVT::v4i32}, Mask))
15099     return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
15100   if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND,
15101                               {MVT::v4i8, MVT::v4i16, MVT::v4i32}, Mask))
15102     return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
15103   if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
15104     return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
15105                        DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
15106   if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
15107     return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
15108                        DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
15109 
15110   SDValue A, B;
15111   if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
15112     return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
15113   if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
15114     return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
15115   if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND,
15116               {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, B))
15117     return Create64bitNode(ARMISD::VMLALVs, {A, B});
15118   if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND,
15119               {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, B))
15120     return Create64bitNode(ARMISD::VMLALVu, {A, B});
15121   if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
15122     return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
15123                        DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
15124   if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
15125     return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
15126                        DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
15127 
15128   if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, Mask))
15129     return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
15130   if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, Mask))
15131     return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
15132   if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND,
15133                   {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A,
15134                   B, Mask))
15135     return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
15136   if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND,
15137                   {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A,
15138                   B, Mask))
15139     return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
15140   if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
15141     return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
15142                        DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
15143   if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
15144     return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
15145                        DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
15146 
15147   // Some complications. We can get a case where the two inputs of the mul are
15148   // the same, then the output sext will have been helpfully converted to a
15149   // zext. Turn it back.
15150   SDValue Op = N0;
15151   if (Op->getOpcode() == ISD::VSELECT)
15152     Op = Op->getOperand(1);
15153   if (Op->getOpcode() == ISD::ZERO_EXTEND &&
15154       Op->getOperand(0)->getOpcode() == ISD::MUL) {
15155     SDValue Mul = Op->getOperand(0);
15156     if (Mul->getOperand(0) == Mul->getOperand(1) &&
15157         Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
15158       SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
15159       if (Op != N0)
15160         Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
15161                           N0->getOperand(0), Ext, N0->getOperand(2));
15162       return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
15163     }
15164   }
15165 
15166   return SDValue();
15167 }
15168 
15169 static SDValue PerformVMOVNCombine(SDNode *N,
15170                                    TargetLowering::DAGCombinerInfo &DCI) {
15171   SDValue Op0 = N->getOperand(0);
15172   SDValue Op1 = N->getOperand(1);
15173   unsigned IsTop = N->getConstantOperandVal(2);
15174 
15175   // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
15176   // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
15177   if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
15178        Op1->getOpcode() == ARMISD::VQMOVNu) &&
15179       Op1->getConstantOperandVal(2) == 0)
15180     return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
15181                            Op0, Op1->getOperand(1), N->getOperand(2));
15182 
15183   // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
15184   // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
15185   // into the top or bottom lanes.
15186   unsigned NumElts = N->getValueType(0).getVectorNumElements();
15187   APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
15188   APInt Op0DemandedElts =
15189       IsTop ? Op1DemandedElts
15190             : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
15191 
15192   APInt KnownUndef, KnownZero;
15193   const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15194   if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
15195                                      KnownZero, DCI))
15196     return SDValue(N, 0);
15197   if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, KnownUndef,
15198                                      KnownZero, DCI))
15199     return SDValue(N, 0);
15200 
15201   return SDValue();
15202 }
15203 
15204 static SDValue PerformVQMOVNCombine(SDNode *N,
15205                                     TargetLowering::DAGCombinerInfo &DCI) {
15206   SDValue Op0 = N->getOperand(0);
15207   unsigned IsTop = N->getConstantOperandVal(2);
15208 
15209   unsigned NumElts = N->getValueType(0).getVectorNumElements();
15210   APInt Op0DemandedElts =
15211       APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
15212                                      : APInt::getHighBitsSet(2, 1));
15213 
15214   APInt KnownUndef, KnownZero;
15215   const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15216   if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
15217                                      KnownZero, DCI))
15218     return SDValue(N, 0);
15219   return SDValue();
15220 }
15221 
15222 static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
15223   SDLoc DL(N);
15224   SDValue Op0 = N->getOperand(0);
15225   SDValue Op1 = N->getOperand(1);
15226 
15227   // Turn X << -C -> X >> C and viceversa. The negative shifts can come up from
15228   // uses of the intrinsics.
15229   if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
15230     int ShiftAmt = C->getSExtValue();
15231     if (ShiftAmt == 0) {
15232       SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
15233       DAG.ReplaceAllUsesWith(N, Merge.getNode());
15234       return SDValue();
15235     }
15236 
15237     if (ShiftAmt >= -32 && ShiftAmt < 0) {
15238       unsigned NewOpcode =
15239           N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
15240       SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
15241                                      DAG.getConstant(-ShiftAmt, DL, MVT::i32));
15242       DAG.ReplaceAllUsesWith(N, NewShift.getNode());
15243       return NewShift;
15244     }
15245   }
15246 
15247   return SDValue();
15248 }
15249 
15250 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
15251 SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
15252                                                    DAGCombinerInfo &DCI) const {
15253   SelectionDAG &DAG = DCI.DAG;
15254   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
15255   switch (IntNo) {
15256   default:
15257     // Don't do anything for most intrinsics.
15258     break;
15259 
15260   // Vector shifts: check for immediate versions and lower them.
15261   // Note: This is done during DAG combining instead of DAG legalizing because
15262   // the build_vectors for 64-bit vector element shift counts are generally
15263   // not legal, and it is hard to see their values after they get legalized to
15264   // loads from a constant pool.
15265   case Intrinsic::arm_neon_vshifts:
15266   case Intrinsic::arm_neon_vshiftu:
15267   case Intrinsic::arm_neon_vrshifts:
15268   case Intrinsic::arm_neon_vrshiftu:
15269   case Intrinsic::arm_neon_vrshiftn:
15270   case Intrinsic::arm_neon_vqshifts:
15271   case Intrinsic::arm_neon_vqshiftu:
15272   case Intrinsic::arm_neon_vqshiftsu:
15273   case Intrinsic::arm_neon_vqshiftns:
15274   case Intrinsic::arm_neon_vqshiftnu:
15275   case Intrinsic::arm_neon_vqshiftnsu:
15276   case Intrinsic::arm_neon_vqrshiftns:
15277   case Intrinsic::arm_neon_vqrshiftnu:
15278   case Intrinsic::arm_neon_vqrshiftnsu: {
15279     EVT VT = N->getOperand(1).getValueType();
15280     int64_t Cnt;
15281     unsigned VShiftOpc = 0;
15282 
15283     switch (IntNo) {
15284     case Intrinsic::arm_neon_vshifts:
15285     case Intrinsic::arm_neon_vshiftu:
15286       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
15287         VShiftOpc = ARMISD::VSHLIMM;
15288         break;
15289       }
15290       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
15291         VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
15292                                                           : ARMISD::VSHRuIMM);
15293         break;
15294       }
15295       return SDValue();
15296 
15297     case Intrinsic::arm_neon_vrshifts:
15298     case Intrinsic::arm_neon_vrshiftu:
15299       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
15300         break;
15301       return SDValue();
15302 
15303     case Intrinsic::arm_neon_vqshifts:
15304     case Intrinsic::arm_neon_vqshiftu:
15305       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
15306         break;
15307       return SDValue();
15308 
15309     case Intrinsic::arm_neon_vqshiftsu:
15310       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
15311         break;
15312       llvm_unreachable("invalid shift count for vqshlu intrinsic");
15313 
15314     case Intrinsic::arm_neon_vrshiftn:
15315     case Intrinsic::arm_neon_vqshiftns:
15316     case Intrinsic::arm_neon_vqshiftnu:
15317     case Intrinsic::arm_neon_vqshiftnsu:
15318     case Intrinsic::arm_neon_vqrshiftns:
15319     case Intrinsic::arm_neon_vqrshiftnu:
15320     case Intrinsic::arm_neon_vqrshiftnsu:
15321       // Narrowing shifts require an immediate right shift.
15322       if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
15323         break;
15324       llvm_unreachable("invalid shift count for narrowing vector shift "
15325                        "intrinsic");
15326 
15327     default:
15328       llvm_unreachable("unhandled vector shift");
15329     }
15330 
15331     switch (IntNo) {
15332     case Intrinsic::arm_neon_vshifts:
15333     case Intrinsic::arm_neon_vshiftu:
15334       // Opcode already set above.
15335       break;
15336     case Intrinsic::arm_neon_vrshifts:
15337       VShiftOpc = ARMISD::VRSHRsIMM;
15338       break;
15339     case Intrinsic::arm_neon_vrshiftu:
15340       VShiftOpc = ARMISD::VRSHRuIMM;
15341       break;
15342     case Intrinsic::arm_neon_vrshiftn:
15343       VShiftOpc = ARMISD::VRSHRNIMM;
15344       break;
15345     case Intrinsic::arm_neon_vqshifts:
15346       VShiftOpc = ARMISD::VQSHLsIMM;
15347       break;
15348     case Intrinsic::arm_neon_vqshiftu:
15349       VShiftOpc = ARMISD::VQSHLuIMM;
15350       break;
15351     case Intrinsic::arm_neon_vqshiftsu:
15352       VShiftOpc = ARMISD::VQSHLsuIMM;
15353       break;
15354     case Intrinsic::arm_neon_vqshiftns:
15355       VShiftOpc = ARMISD::VQSHRNsIMM;
15356       break;
15357     case Intrinsic::arm_neon_vqshiftnu:
15358       VShiftOpc = ARMISD::VQSHRNuIMM;
15359       break;
15360     case Intrinsic::arm_neon_vqshiftnsu:
15361       VShiftOpc = ARMISD::VQSHRNsuIMM;
15362       break;
15363     case Intrinsic::arm_neon_vqrshiftns:
15364       VShiftOpc = ARMISD::VQRSHRNsIMM;
15365       break;
15366     case Intrinsic::arm_neon_vqrshiftnu:
15367       VShiftOpc = ARMISD::VQRSHRNuIMM;
15368       break;
15369     case Intrinsic::arm_neon_vqrshiftnsu:
15370       VShiftOpc = ARMISD::VQRSHRNsuIMM;
15371       break;
15372     }
15373 
15374     SDLoc dl(N);
15375     return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
15376                        N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
15377   }
15378 
15379   case Intrinsic::arm_neon_vshiftins: {
15380     EVT VT = N->getOperand(1).getValueType();
15381     int64_t Cnt;
15382     unsigned VShiftOpc = 0;
15383 
15384     if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
15385       VShiftOpc = ARMISD::VSLIIMM;
15386     else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
15387       VShiftOpc = ARMISD::VSRIIMM;
15388     else {
15389       llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
15390     }
15391 
15392     SDLoc dl(N);
15393     return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
15394                        N->getOperand(1), N->getOperand(2),
15395                        DAG.getConstant(Cnt, dl, MVT::i32));
15396   }
15397 
15398   case Intrinsic::arm_neon_vqrshifts:
15399   case Intrinsic::arm_neon_vqrshiftu:
15400     // No immediate versions of these to check for.
15401     break;
15402 
15403   case Intrinsic::arm_mve_vqdmlah:
15404   case Intrinsic::arm_mve_vqdmlash:
15405   case Intrinsic::arm_mve_vqrdmlah:
15406   case Intrinsic::arm_mve_vqrdmlash:
15407   case Intrinsic::arm_mve_vmla_n_predicated:
15408   case Intrinsic::arm_mve_vmlas_n_predicated:
15409   case Intrinsic::arm_mve_vqdmlah_predicated:
15410   case Intrinsic::arm_mve_vqdmlash_predicated:
15411   case Intrinsic::arm_mve_vqrdmlah_predicated:
15412   case Intrinsic::arm_mve_vqrdmlash_predicated: {
15413     // These intrinsics all take an i32 scalar operand which is narrowed to the
15414     // size of a single lane of the vector type they return. So we don't need
15415     // any bits of that operand above that point, which allows us to eliminate
15416     // uxth/sxth.
15417     unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
15418     APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
15419     if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
15420       return SDValue();
15421     break;
15422   }
15423 
15424   case Intrinsic::arm_mve_minv:
15425   case Intrinsic::arm_mve_maxv:
15426   case Intrinsic::arm_mve_minav:
15427   case Intrinsic::arm_mve_maxav:
15428   case Intrinsic::arm_mve_minv_predicated:
15429   case Intrinsic::arm_mve_maxv_predicated:
15430   case Intrinsic::arm_mve_minav_predicated:
15431   case Intrinsic::arm_mve_maxav_predicated: {
15432     // These intrinsics all take an i32 scalar operand which is narrowed to the
15433     // size of a single lane of the vector type they take as the other input.
15434     unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
15435     APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
15436     if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
15437       return SDValue();
15438     break;
15439   }
15440 
15441   case Intrinsic::arm_mve_addv: {
15442     // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
15443     // which allow PerformADDVecReduce to turn it into VADDLV when possible.
15444     bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
15445     unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
15446     return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
15447   }
15448 
15449   case Intrinsic::arm_mve_addlv:
15450   case Intrinsic::arm_mve_addlv_predicated: {
15451     // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
15452     // which recombines the two outputs into an i64
15453     bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
15454     unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
15455                     (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
15456                     (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
15457 
15458     SmallVector<SDValue, 4> Ops;
15459     for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
15460       if (i != 2)                      // skip the unsigned flag
15461         Ops.push_back(N->getOperand(i));
15462 
15463     SDLoc dl(N);
15464     SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
15465     return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
15466                        val.getValue(1));
15467   }
15468   }
15469 
15470   return SDValue();
15471 }
15472 
15473 /// PerformShiftCombine - Checks for immediate versions of vector shifts and
15474 /// lowers them.  As with the vector shift intrinsics, this is done during DAG
15475 /// combining instead of DAG legalizing because the build_vectors for 64-bit
15476 /// vector element shift counts are generally not legal, and it is hard to see
15477 /// their values after they get legalized to loads from a constant pool.
15478 static SDValue PerformShiftCombine(SDNode *N,
15479                                    TargetLowering::DAGCombinerInfo &DCI,
15480                                    const ARMSubtarget *ST) {
15481   SelectionDAG &DAG = DCI.DAG;
15482   EVT VT = N->getValueType(0);
15483   if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
15484     // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
15485     // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
15486     SDValue N1 = N->getOperand(1);
15487     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
15488       SDValue N0 = N->getOperand(0);
15489       if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
15490           DAG.MaskedValueIsZero(N0.getOperand(0),
15491                                 APInt::getHighBitsSet(32, 16)))
15492         return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
15493     }
15494   }
15495 
15496   if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
15497       N->getOperand(0)->getOpcode() == ISD::AND &&
15498       N->getOperand(0)->hasOneUse()) {
15499     if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
15500       return SDValue();
15501     // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
15502     // usually show up because instcombine prefers to canonicalize it to
15503     // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
15504     // out of GEP lowering in some cases.
15505     SDValue N0 = N->getOperand(0);
15506     ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
15507     if (!ShiftAmtNode)
15508       return SDValue();
15509     uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
15510     ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
15511     if (!AndMaskNode)
15512       return SDValue();
15513     uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
15514     // Don't transform uxtb/uxth.
15515     if (AndMask == 255 || AndMask == 65535)
15516       return SDValue();
15517     if (isMask_32(AndMask)) {
15518       uint32_t MaskedBits = countLeadingZeros(AndMask);
15519       if (MaskedBits > ShiftAmt) {
15520         SDLoc DL(N);
15521         SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
15522                                   DAG.getConstant(MaskedBits, DL, MVT::i32));
15523         return DAG.getNode(
15524             ISD::SRL, DL, MVT::i32, SHL,
15525             DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
15526       }
15527     }
15528   }
15529 
15530   // Nothing to be done for scalar shifts.
15531   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15532   if (!VT.isVector() || !TLI.isTypeLegal(VT))
15533     return SDValue();
15534   if (ST->hasMVEIntegerOps() && VT == MVT::v2i64)
15535     return SDValue();
15536 
15537   int64_t Cnt;
15538 
15539   switch (N->getOpcode()) {
15540   default: llvm_unreachable("unexpected shift opcode");
15541 
15542   case ISD::SHL:
15543     if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
15544       SDLoc dl(N);
15545       return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
15546                          DAG.getConstant(Cnt, dl, MVT::i32));
15547     }
15548     break;
15549 
15550   case ISD::SRA:
15551   case ISD::SRL:
15552     if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
15553       unsigned VShiftOpc =
15554           (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
15555       SDLoc dl(N);
15556       return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
15557                          DAG.getConstant(Cnt, dl, MVT::i32));
15558     }
15559   }
15560   return SDValue();
15561 }
15562 
15563 // Look for a sign/zero/fpextend extend of a larger than legal load. This can be
15564 // split into multiple extending loads, which are simpler to deal with than an
15565 // arbitrary extend. For fp extends we use an integer extending load and a VCVTL
15566 // to convert the type to an f32.
15567 static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
15568   SDValue N0 = N->getOperand(0);
15569   if (N0.getOpcode() != ISD::LOAD)
15570     return SDValue();
15571   LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
15572   if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
15573       LD->getExtensionType() != ISD::NON_EXTLOAD)
15574     return SDValue();
15575   EVT FromVT = LD->getValueType(0);
15576   EVT ToVT = N->getValueType(0);
15577   if (!ToVT.isVector())
15578     return SDValue();
15579   assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
15580   EVT ToEltVT = ToVT.getVectorElementType();
15581   EVT FromEltVT = FromVT.getVectorElementType();
15582 
15583   unsigned NumElements = 0;
15584   if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
15585     NumElements = 4;
15586   if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
15587     NumElements = 8;
15588   if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
15589     NumElements = 4;
15590   if (NumElements == 0 ||
15591       (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
15592       FromVT.getVectorNumElements() % NumElements != 0 ||
15593       !isPowerOf2_32(NumElements))
15594     return SDValue();
15595 
15596   LLVMContext &C = *DAG.getContext();
15597   SDLoc DL(LD);
15598   // Details about the old load
15599   SDValue Ch = LD->getChain();
15600   SDValue BasePtr = LD->getBasePtr();
15601   Align Alignment = LD->getOriginalAlign();
15602   MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
15603   AAMDNodes AAInfo = LD->getAAInfo();
15604 
15605   ISD::LoadExtType NewExtType =
15606       N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
15607   SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
15608   EVT NewFromVT = EVT::getVectorVT(
15609       C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
15610   EVT NewToVT = EVT::getVectorVT(
15611       C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
15612 
15613   SmallVector<SDValue, 4> Loads;
15614   SmallVector<SDValue, 4> Chains;
15615   for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
15616     unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
15617     SDValue NewPtr =
15618         DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
15619 
15620     SDValue NewLoad =
15621         DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
15622                     LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
15623                     Alignment, MMOFlags, AAInfo);
15624     Loads.push_back(NewLoad);
15625     Chains.push_back(SDValue(NewLoad.getNode(), 1));
15626   }
15627 
15628   // Float truncs need to extended with VCVTB's into their floating point types.
15629   if (FromEltVT == MVT::f16) {
15630     SmallVector<SDValue, 4> Extends;
15631 
15632     for (unsigned i = 0; i < Loads.size(); i++) {
15633       SDValue LoadBC =
15634           DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
15635       SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
15636                                   DAG.getConstant(0, DL, MVT::i32));
15637       Extends.push_back(FPExt);
15638     }
15639 
15640     Loads = Extends;
15641   }
15642 
15643   SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
15644   DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
15645   return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
15646 }
15647 
15648 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
15649 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
15650 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
15651                                     const ARMSubtarget *ST) {
15652   SDValue N0 = N->getOperand(0);
15653 
15654   // Check for sign- and zero-extensions of vector extract operations of 8- and
15655   // 16-bit vector elements. NEON and MVE support these directly. They are
15656   // handled during DAG combining because type legalization will promote them
15657   // to 32-bit types and it is messy to recognize the operations after that.
15658   if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
15659       N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
15660     SDValue Vec = N0.getOperand(0);
15661     SDValue Lane = N0.getOperand(1);
15662     EVT VT = N->getValueType(0);
15663     EVT EltVT = N0.getValueType();
15664     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15665 
15666     if (VT == MVT::i32 &&
15667         (EltVT == MVT::i8 || EltVT == MVT::i16) &&
15668         TLI.isTypeLegal(Vec.getValueType()) &&
15669         isa<ConstantSDNode>(Lane)) {
15670 
15671       unsigned Opc = 0;
15672       switch (N->getOpcode()) {
15673       default: llvm_unreachable("unexpected opcode");
15674       case ISD::SIGN_EXTEND:
15675         Opc = ARMISD::VGETLANEs;
15676         break;
15677       case ISD::ZERO_EXTEND:
15678       case ISD::ANY_EXTEND:
15679         Opc = ARMISD::VGETLANEu;
15680         break;
15681       }
15682       return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
15683     }
15684   }
15685 
15686   if (ST->hasMVEIntegerOps())
15687     if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
15688       return NewLoad;
15689 
15690   return SDValue();
15691 }
15692 
15693 static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
15694                                       const ARMSubtarget *ST) {
15695   if (ST->hasMVEFloatOps())
15696     if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
15697       return NewLoad;
15698 
15699   return SDValue();
15700 }
15701 
15702 /// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
15703 /// saturates.
15704 static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
15705                                     const ARMSubtarget *ST) {
15706   EVT VT = N->getValueType(0);
15707   SDValue N0 = N->getOperand(0);
15708   if (!ST->hasMVEIntegerOps())
15709     return SDValue();
15710 
15711   if (SDValue V = PerformVQDMULHCombine(N, DAG))
15712     return V;
15713 
15714   if (VT != MVT::v4i32 && VT != MVT::v8i16)
15715     return SDValue();
15716 
15717   auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
15718     // Check one is a smin and the other is a smax
15719     if (Min->getOpcode() != ISD::SMIN)
15720       std::swap(Min, Max);
15721     if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
15722       return false;
15723 
15724     APInt SaturateC;
15725     if (VT == MVT::v4i32)
15726       SaturateC = APInt(32, (1 << 15) - 1, true);
15727     else //if (VT == MVT::v8i16)
15728       SaturateC = APInt(16, (1 << 7) - 1, true);
15729 
15730     APInt MinC, MaxC;
15731     if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
15732         MinC != SaturateC)
15733       return false;
15734     if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
15735         MaxC != ~SaturateC)
15736       return false;
15737     return true;
15738   };
15739 
15740   if (IsSignedSaturate(N, N0.getNode())) {
15741     SDLoc DL(N);
15742     MVT ExtVT, HalfVT;
15743     if (VT == MVT::v4i32) {
15744       HalfVT = MVT::v8i16;
15745       ExtVT = MVT::v4i16;
15746     } else { // if (VT == MVT::v8i16)
15747       HalfVT = MVT::v16i8;
15748       ExtVT = MVT::v8i8;
15749     }
15750 
15751     // Create a VQMOVNB with undef top lanes, then signed extended into the top
15752     // half. That extend will hopefully be removed if only the bottom bits are
15753     // demanded (though a truncating store, for example).
15754     SDValue VQMOVN =
15755         DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
15756                     N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
15757     SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
15758     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
15759                        DAG.getValueType(ExtVT));
15760   }
15761 
15762   auto IsUnsignedSaturate = [&](SDNode *Min) {
15763     // For unsigned, we just need to check for <= 0xffff
15764     if (Min->getOpcode() != ISD::UMIN)
15765       return false;
15766 
15767     APInt SaturateC;
15768     if (VT == MVT::v4i32)
15769       SaturateC = APInt(32, (1 << 16) - 1, true);
15770     else //if (VT == MVT::v8i16)
15771       SaturateC = APInt(16, (1 << 8) - 1, true);
15772 
15773     APInt MinC;
15774     if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
15775         MinC != SaturateC)
15776       return false;
15777     return true;
15778   };
15779 
15780   if (IsUnsignedSaturate(N)) {
15781     SDLoc DL(N);
15782     MVT HalfVT;
15783     unsigned ExtConst;
15784     if (VT == MVT::v4i32) {
15785       HalfVT = MVT::v8i16;
15786       ExtConst = 0x0000FFFF;
15787     } else { //if (VT == MVT::v8i16)
15788       HalfVT = MVT::v16i8;
15789       ExtConst = 0x00FF;
15790     }
15791 
15792     // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
15793     // an AND. That extend will hopefully be removed if only the bottom bits are
15794     // demanded (though a truncating store, for example).
15795     SDValue VQMOVN =
15796         DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
15797                     DAG.getConstant(0, DL, MVT::i32));
15798     SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
15799     return DAG.getNode(ISD::AND, DL, VT, Bitcast,
15800                        DAG.getConstant(ExtConst, DL, VT));
15801   }
15802 
15803   return SDValue();
15804 }
15805 
15806 static const APInt *isPowerOf2Constant(SDValue V) {
15807   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
15808   if (!C)
15809     return nullptr;
15810   const APInt *CV = &C->getAPIntValue();
15811   return CV->isPowerOf2() ? CV : nullptr;
15812 }
15813 
15814 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
15815   // If we have a CMOV, OR and AND combination such as:
15816   //   if (x & CN)
15817   //     y |= CM;
15818   //
15819   // And:
15820   //   * CN is a single bit;
15821   //   * All bits covered by CM are known zero in y
15822   //
15823   // Then we can convert this into a sequence of BFI instructions. This will
15824   // always be a win if CM is a single bit, will always be no worse than the
15825   // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
15826   // three bits (due to the extra IT instruction).
15827 
15828   SDValue Op0 = CMOV->getOperand(0);
15829   SDValue Op1 = CMOV->getOperand(1);
15830   auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
15831   auto CC = CCNode->getAPIntValue().getLimitedValue();
15832   SDValue CmpZ = CMOV->getOperand(4);
15833 
15834   // The compare must be against zero.
15835   if (!isNullConstant(CmpZ->getOperand(1)))
15836     return SDValue();
15837 
15838   assert(CmpZ->getOpcode() == ARMISD::CMPZ);
15839   SDValue And = CmpZ->getOperand(0);
15840   if (And->getOpcode() != ISD::AND)
15841     return SDValue();
15842   const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
15843   if (!AndC)
15844     return SDValue();
15845   SDValue X = And->getOperand(0);
15846 
15847   if (CC == ARMCC::EQ) {
15848     // We're performing an "equal to zero" compare. Swap the operands so we
15849     // canonicalize on a "not equal to zero" compare.
15850     std::swap(Op0, Op1);
15851   } else {
15852     assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
15853   }
15854 
15855   if (Op1->getOpcode() != ISD::OR)
15856     return SDValue();
15857 
15858   ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
15859   if (!OrC)
15860     return SDValue();
15861   SDValue Y = Op1->getOperand(0);
15862 
15863   if (Op0 != Y)
15864     return SDValue();
15865 
15866   // Now, is it profitable to continue?
15867   APInt OrCI = OrC->getAPIntValue();
15868   unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
15869   if (OrCI.countPopulation() > Heuristic)
15870     return SDValue();
15871 
15872   // Lastly, can we determine that the bits defined by OrCI
15873   // are zero in Y?
15874   KnownBits Known = DAG.computeKnownBits(Y);
15875   if ((OrCI & Known.Zero) != OrCI)
15876     return SDValue();
15877 
15878   // OK, we can do the combine.
15879   SDValue V = Y;
15880   SDLoc dl(X);
15881   EVT VT = X.getValueType();
15882   unsigned BitInX = AndC->logBase2();
15883 
15884   if (BitInX != 0) {
15885     // We must shift X first.
15886     X = DAG.getNode(ISD::SRL, dl, VT, X,
15887                     DAG.getConstant(BitInX, dl, VT));
15888   }
15889 
15890   for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
15891        BitInY < NumActiveBits; ++BitInY) {
15892     if (OrCI[BitInY] == 0)
15893       continue;
15894     APInt Mask(VT.getSizeInBits(), 0);
15895     Mask.setBit(BitInY);
15896     V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
15897                     // Confusingly, the operand is an *inverted* mask.
15898                     DAG.getConstant(~Mask, dl, VT));
15899   }
15900 
15901   return V;
15902 }
15903 
15904 // Given N, the value controlling the conditional branch, search for the loop
15905 // intrinsic, returning it, along with how the value is used. We need to handle
15906 // patterns such as the following:
15907 // (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
15908 // (brcond (setcc (loop.decrement), 0, eq), exit)
15909 // (brcond (setcc (loop.decrement), 0, ne), header)
15910 static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
15911                                    bool &Negate) {
15912   switch (N->getOpcode()) {
15913   default:
15914     break;
15915   case ISD::XOR: {
15916     if (!isa<ConstantSDNode>(N.getOperand(1)))
15917       return SDValue();
15918     if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
15919       return SDValue();
15920     Negate = !Negate;
15921     return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
15922   }
15923   case ISD::SETCC: {
15924     auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
15925     if (!Const)
15926       return SDValue();
15927     if (Const->isNullValue())
15928       Imm = 0;
15929     else if (Const->isOne())
15930       Imm = 1;
15931     else
15932       return SDValue();
15933     CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
15934     return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
15935   }
15936   case ISD::INTRINSIC_W_CHAIN: {
15937     unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
15938     if (IntOp != Intrinsic::test_set_loop_iterations &&
15939         IntOp != Intrinsic::loop_decrement_reg)
15940       return SDValue();
15941     return N;
15942   }
15943   }
15944   return SDValue();
15945 }
15946 
15947 static SDValue PerformHWLoopCombine(SDNode *N,
15948                                     TargetLowering::DAGCombinerInfo &DCI,
15949                                     const ARMSubtarget *ST) {
15950 
15951   // The hwloop intrinsics that we're interested are used for control-flow,
15952   // either for entering or exiting the loop:
15953   // - test.set.loop.iterations will test whether its operand is zero. If it
15954   //   is zero, the proceeding branch should not enter the loop.
15955   // - loop.decrement.reg also tests whether its operand is zero. If it is
15956   //   zero, the proceeding branch should not branch back to the beginning of
15957   //   the loop.
15958   // So here, we need to check that how the brcond is using the result of each
15959   // of the intrinsics to ensure that we're branching to the right place at the
15960   // right time.
15961 
15962   ISD::CondCode CC;
15963   SDValue Cond;
15964   int Imm = 1;
15965   bool Negate = false;
15966   SDValue Chain = N->getOperand(0);
15967   SDValue Dest;
15968 
15969   if (N->getOpcode() == ISD::BRCOND) {
15970     CC = ISD::SETEQ;
15971     Cond = N->getOperand(1);
15972     Dest = N->getOperand(2);
15973   } else {
15974     assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
15975     CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
15976     Cond = N->getOperand(2);
15977     Dest = N->getOperand(4);
15978     if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
15979       if (!Const->isOne() && !Const->isNullValue())
15980         return SDValue();
15981       Imm = Const->getZExtValue();
15982     } else
15983       return SDValue();
15984   }
15985 
15986   SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
15987   if (!Int)
15988     return SDValue();
15989 
15990   if (Negate)
15991     CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
15992 
15993   auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
15994     return (CC == ISD::SETEQ && Imm == 0) ||
15995            (CC == ISD::SETNE && Imm == 1) ||
15996            (CC == ISD::SETLT && Imm == 1) ||
15997            (CC == ISD::SETULT && Imm == 1);
15998   };
15999 
16000   auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
16001     return (CC == ISD::SETEQ && Imm == 1) ||
16002            (CC == ISD::SETNE && Imm == 0) ||
16003            (CC == ISD::SETGT && Imm == 0) ||
16004            (CC == ISD::SETUGT && Imm == 0) ||
16005            (CC == ISD::SETGE && Imm == 1) ||
16006            (CC == ISD::SETUGE && Imm == 1);
16007   };
16008 
16009   assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
16010          "unsupported condition");
16011 
16012   SDLoc dl(Int);
16013   SelectionDAG &DAG = DCI.DAG;
16014   SDValue Elements = Int.getOperand(2);
16015   unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
16016   assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
16017           && "expected single br user");
16018   SDNode *Br = *N->use_begin();
16019   SDValue OtherTarget = Br->getOperand(1);
16020 
16021   // Update the unconditional branch to branch to the given Dest.
16022   auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
16023     SDValue NewBrOps[] = { Br->getOperand(0), Dest };
16024     SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
16025     DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
16026   };
16027 
16028   if (IntOp == Intrinsic::test_set_loop_iterations) {
16029     SDValue Res;
16030     // We expect this 'instruction' to branch when the counter is zero.
16031     if (IsTrueIfZero(CC, Imm)) {
16032       SDValue Ops[] = { Chain, Elements, Dest };
16033       Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
16034     } else {
16035       // The logic is the reverse of what we need for WLS, so find the other
16036       // basic block target: the target of the proceeding br.
16037       UpdateUncondBr(Br, Dest, DAG);
16038 
16039       SDValue Ops[] = { Chain, Elements, OtherTarget };
16040       Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
16041     }
16042     DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
16043     return Res;
16044   } else {
16045     SDValue Size = DAG.getTargetConstant(
16046       cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32);
16047     SDValue Args[] = { Int.getOperand(0), Elements, Size, };
16048     SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
16049                                   DAG.getVTList(MVT::i32, MVT::Other), Args);
16050     DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
16051 
16052     // We expect this instruction to branch when the count is not zero.
16053     SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
16054 
16055     // Update the unconditional branch to target the loop preheader if we've
16056     // found the condition has been reversed.
16057     if (Target == OtherTarget)
16058       UpdateUncondBr(Br, Dest, DAG);
16059 
16060     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
16061                         SDValue(LoopDec.getNode(), 1), Chain);
16062 
16063     SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
16064     return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
16065   }
16066   return SDValue();
16067 }
16068 
16069 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
16070 SDValue
16071 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
16072   SDValue Cmp = N->getOperand(4);
16073   if (Cmp.getOpcode() != ARMISD::CMPZ)
16074     // Only looking at NE cases.
16075     return SDValue();
16076 
16077   EVT VT = N->getValueType(0);
16078   SDLoc dl(N);
16079   SDValue LHS = Cmp.getOperand(0);
16080   SDValue RHS = Cmp.getOperand(1);
16081   SDValue Chain = N->getOperand(0);
16082   SDValue BB = N->getOperand(1);
16083   SDValue ARMcc = N->getOperand(2);
16084   ARMCC::CondCodes CC =
16085     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
16086 
16087   // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
16088   // -> (brcond Chain BB CC CPSR Cmp)
16089   if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
16090       LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
16091       LHS->getOperand(0)->hasOneUse()) {
16092     auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
16093     auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
16094     auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
16095     auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
16096     if ((LHS00C && LHS00C->getZExtValue() == 0) &&
16097         (LHS01C && LHS01C->getZExtValue() == 1) &&
16098         (LHS1C && LHS1C->getZExtValue() == 1) &&
16099         (RHSC && RHSC->getZExtValue() == 0)) {
16100       return DAG.getNode(
16101           ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
16102           LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
16103     }
16104   }
16105 
16106   return SDValue();
16107 }
16108 
16109 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
16110 SDValue
16111 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
16112   SDValue Cmp = N->getOperand(4);
16113   if (Cmp.getOpcode() != ARMISD::CMPZ)
16114     // Only looking at EQ and NE cases.
16115     return SDValue();
16116 
16117   EVT VT = N->getValueType(0);
16118   SDLoc dl(N);
16119   SDValue LHS = Cmp.getOperand(0);
16120   SDValue RHS = Cmp.getOperand(1);
16121   SDValue FalseVal = N->getOperand(0);
16122   SDValue TrueVal = N->getOperand(1);
16123   SDValue ARMcc = N->getOperand(2);
16124   ARMCC::CondCodes CC =
16125     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
16126 
16127   // BFI is only available on V6T2+.
16128   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
16129     SDValue R = PerformCMOVToBFICombine(N, DAG);
16130     if (R)
16131       return R;
16132   }
16133 
16134   // Simplify
16135   //   mov     r1, r0
16136   //   cmp     r1, x
16137   //   mov     r0, y
16138   //   moveq   r0, x
16139   // to
16140   //   cmp     r0, x
16141   //   movne   r0, y
16142   //
16143   //   mov     r1, r0
16144   //   cmp     r1, x
16145   //   mov     r0, x
16146   //   movne   r0, y
16147   // to
16148   //   cmp     r0, x
16149   //   movne   r0, y
16150   /// FIXME: Turn this into a target neutral optimization?
16151   SDValue Res;
16152   if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
16153     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
16154                       N->getOperand(3), Cmp);
16155   } else if (CC == ARMCC::EQ && TrueVal == RHS) {
16156     SDValue ARMcc;
16157     SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
16158     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
16159                       N->getOperand(3), NewCmp);
16160   }
16161 
16162   // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
16163   // -> (cmov F T CC CPSR Cmp)
16164   if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
16165     auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
16166     auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
16167     auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
16168     if ((LHS0C && LHS0C->getZExtValue() == 0) &&
16169         (LHS1C && LHS1C->getZExtValue() == 1) &&
16170         (RHSC && RHSC->getZExtValue() == 0)) {
16171       return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
16172                          LHS->getOperand(2), LHS->getOperand(3),
16173                          LHS->getOperand(4));
16174     }
16175   }
16176 
16177   if (!VT.isInteger())
16178       return SDValue();
16179 
16180   // Materialize a boolean comparison for integers so we can avoid branching.
16181   if (isNullConstant(FalseVal)) {
16182     if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
16183       if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
16184         // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
16185         // right 5 bits will make that 32 be 1, otherwise it will be 0.
16186         // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
16187         SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
16188         Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
16189                           DAG.getConstant(5, dl, MVT::i32));
16190       } else {
16191         // CMOV 0, 1, ==, (CMPZ x, y) ->
16192         //     (ADDCARRY (SUB x, y), t:0, t:1)
16193         // where t = (SUBCARRY 0, (SUB x, y), 0)
16194         //
16195         // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
16196         // x != y. In other words, a carry C == 1 when x == y, C == 0
16197         // otherwise.
16198         // The final ADDCARRY computes
16199         //     x - y + (0 - (x - y)) + C == C
16200         SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
16201         SDVTList VTs = DAG.getVTList(VT, MVT::i32);
16202         SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
16203         // ISD::SUBCARRY returns a borrow but we want the carry here
16204         // actually.
16205         SDValue Carry =
16206             DAG.getNode(ISD::SUB, dl, MVT::i32,
16207                         DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
16208         Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
16209       }
16210     } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
16211                (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
16212       // This seems pointless but will allow us to combine it further below.
16213       // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
16214       SDValue Sub =
16215           DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
16216       SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
16217                                           Sub.getValue(1), SDValue());
16218       Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
16219                         N->getOperand(3), CPSRGlue.getValue(1));
16220       FalseVal = Sub;
16221     }
16222   } else if (isNullConstant(TrueVal)) {
16223     if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
16224         (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
16225       // This seems pointless but will allow us to combine it further below
16226       // Note that we change == for != as this is the dual for the case above.
16227       // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
16228       SDValue Sub =
16229           DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
16230       SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
16231                                           Sub.getValue(1), SDValue());
16232       Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
16233                         DAG.getConstant(ARMCC::NE, dl, MVT::i32),
16234                         N->getOperand(3), CPSRGlue.getValue(1));
16235       FalseVal = Sub;
16236     }
16237   }
16238 
16239   // On Thumb1, the DAG above may be further combined if z is a power of 2
16240   // (z == 2 ^ K).
16241   // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
16242   // t1 = (USUBO (SUB x, y), 1)
16243   // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
16244   // Result = if K != 0 then (SHL t2:0, K) else t2:0
16245   //
16246   // This also handles the special case of comparing against zero; it's
16247   // essentially, the same pattern, except there's no SUBS:
16248   // CMOV x, z, !=, (CMPZ x, 0) ->
16249   // t1 = (USUBO x, 1)
16250   // t2 = (SUBCARRY x, t1:0, t1:1)
16251   // Result = if K != 0 then (SHL t2:0, K) else t2:0
16252   const APInt *TrueConst;
16253   if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
16254       ((FalseVal.getOpcode() == ARMISD::SUBS &&
16255         FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) ||
16256        (FalseVal == LHS && isNullConstant(RHS))) &&
16257       (TrueConst = isPowerOf2Constant(TrueVal))) {
16258     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
16259     unsigned ShiftAmount = TrueConst->logBase2();
16260     if (ShiftAmount)
16261       TrueVal = DAG.getConstant(1, dl, VT);
16262     SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
16263     Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));
16264 
16265     if (ShiftAmount)
16266       Res = DAG.getNode(ISD::SHL, dl, VT, Res,
16267                         DAG.getConstant(ShiftAmount, dl, MVT::i32));
16268   }
16269 
16270   if (Res.getNode()) {
16271     KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
16272     // Capture demanded bits information that would be otherwise lost.
16273     if (Known.Zero == 0xfffffffe)
16274       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
16275                         DAG.getValueType(MVT::i1));
16276     else if (Known.Zero == 0xffffff00)
16277       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
16278                         DAG.getValueType(MVT::i8));
16279     else if (Known.Zero == 0xffff0000)
16280       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
16281                         DAG.getValueType(MVT::i16));
16282   }
16283 
16284   return Res;
16285 }
16286 
16287 static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
16288                                     const ARMSubtarget *ST) {
16289   SDValue Src = N->getOperand(0);
16290   EVT DstVT = N->getValueType(0);
16291 
16292   // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
16293   if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
16294     EVT SrcVT = Src.getValueType();
16295     if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
16296       return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
16297   }
16298 
16299   // We may have a bitcast of something that has already had this bitcast
16300   // combine performed on it, so skip past any VECTOR_REG_CASTs.
16301   while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST)
16302     Src = Src.getOperand(0);
16303 
16304   // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
16305   // would be generated is at least the width of the element type.
16306   EVT SrcVT = Src.getValueType();
16307   if ((Src.getOpcode() == ARMISD::VMOVIMM ||
16308        Src.getOpcode() == ARMISD::VMVNIMM ||
16309        Src.getOpcode() == ARMISD::VMOVFPIMM) &&
16310       SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
16311       DAG.getDataLayout().isBigEndian())
16312     return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
16313 
16314   return SDValue();
16315 }
16316 
16317 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
16318                                              DAGCombinerInfo &DCI) const {
16319   switch (N->getOpcode()) {
16320   default: break;
16321   case ISD::SELECT_CC:
16322   case ISD::SELECT:     return PerformSELECTCombine(N, DCI, Subtarget);
16323   case ISD::VSELECT:    return PerformVSELECTCombine(N, DCI, Subtarget);
16324   case ISD::ABS:        return PerformABSCombine(N, DCI, Subtarget);
16325   case ARMISD::ADDE:    return PerformADDECombine(N, DCI, Subtarget);
16326   case ARMISD::UMLAL:   return PerformUMLALCombine(N, DCI.DAG, Subtarget);
16327   case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
16328   case ISD::SUB:        return PerformSUBCombine(N, DCI, Subtarget);
16329   case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
16330   case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
16331   case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
16332   case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
16333   case ISD::BRCOND:
16334   case ISD::BR_CC:      return PerformHWLoopCombine(N, DCI, Subtarget);
16335   case ARMISD::ADDC:
16336   case ARMISD::SUBC:    return PerformAddcSubcCombine(N, DCI, Subtarget);
16337   case ARMISD::SUBE:    return PerformAddeSubeCombine(N, DCI, Subtarget);
16338   case ARMISD::BFI:     return PerformBFICombine(N, DCI);
16339   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
16340   case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
16341   case ARMISD::VMOVhr:  return PerformVMOVhrCombine(N, DCI);
16342   case ARMISD::VMOVrh:  return PerformVMOVrhCombine(N, DCI);
16343   case ISD::STORE:      return PerformSTORECombine(N, DCI, Subtarget);
16344   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
16345   case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
16346   case ISD::EXTRACT_VECTOR_ELT: return PerformExtractEltCombine(N, DCI);
16347   case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
16348   case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
16349   case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget);
16350   case ISD::FP_TO_SINT:
16351   case ISD::FP_TO_UINT:
16352     return PerformVCVTCombine(N, DCI.DAG, Subtarget);
16353   case ISD::FDIV:
16354     return PerformVDIVCombine(N, DCI.DAG, Subtarget);
16355   case ISD::INTRINSIC_WO_CHAIN:
16356     return PerformIntrinsicCombine(N, DCI);
16357   case ISD::SHL:
16358   case ISD::SRA:
16359   case ISD::SRL:
16360     return PerformShiftCombine(N, DCI, Subtarget);
16361   case ISD::SIGN_EXTEND:
16362   case ISD::ZERO_EXTEND:
16363   case ISD::ANY_EXTEND:
16364     return PerformExtendCombine(N, DCI.DAG, Subtarget);
16365   case ISD::FP_EXTEND:
16366     return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
16367   case ISD::SMIN:
16368   case ISD::UMIN:
16369   case ISD::SMAX:
16370   case ISD::UMAX:
16371     return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
16372   case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
16373   case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
16374   case ISD::LOAD:       return PerformLOADCombine(N, DCI);
16375   case ARMISD::VLD1DUP:
16376   case ARMISD::VLD2DUP:
16377   case ARMISD::VLD3DUP:
16378   case ARMISD::VLD4DUP:
16379     return PerformVLDCombine(N, DCI);
16380   case ARMISD::BUILD_VECTOR:
16381     return PerformARMBUILD_VECTORCombine(N, DCI);
16382   case ISD::BITCAST:
16383     return PerformBITCASTCombine(N, DCI.DAG, Subtarget);
16384   case ARMISD::PREDICATE_CAST:
16385     return PerformPREDICATE_CASTCombine(N, DCI);
16386   case ARMISD::VECTOR_REG_CAST:
16387     return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget);
16388   case ARMISD::VCMP:
16389     return PerformVCMPCombine(N, DCI, Subtarget);
16390   case ISD::VECREDUCE_ADD:
16391     return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
16392   case ARMISD::VMOVN:
16393     return PerformVMOVNCombine(N, DCI);
16394   case ARMISD::VQMOVNs:
16395   case ARMISD::VQMOVNu:
16396     return PerformVQMOVNCombine(N, DCI);
16397   case ARMISD::ASRL:
16398   case ARMISD::LSRL:
16399   case ARMISD::LSLL:
16400     return PerformLongShiftCombine(N, DCI.DAG);
16401   case ARMISD::SMULWB: {
16402     unsigned BitWidth = N->getValueType(0).getSizeInBits();
16403     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
16404     if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
16405       return SDValue();
16406     break;
16407   }
16408   case ARMISD::SMULWT: {
16409     unsigned BitWidth = N->getValueType(0).getSizeInBits();
16410     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
16411     if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
16412       return SDValue();
16413     break;
16414   }
16415   case ARMISD::SMLALBB:
16416   case ARMISD::QADD16b:
16417   case ARMISD::QSUB16b: {
16418     unsigned BitWidth = N->getValueType(0).getSizeInBits();
16419     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
16420     if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
16421         (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
16422       return SDValue();
16423     break;
16424   }
16425   case ARMISD::SMLALBT: {
16426     unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
16427     APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
16428     unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
16429     APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
16430     if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
16431         (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
16432       return SDValue();
16433     break;
16434   }
16435   case ARMISD::SMLALTB: {
16436     unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
16437     APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
16438     unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
16439     APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
16440     if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
16441         (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
16442       return SDValue();
16443     break;
16444   }
16445   case ARMISD::SMLALTT: {
16446     unsigned BitWidth = N->getValueType(0).getSizeInBits();
16447     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
16448     if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
16449         (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
16450       return SDValue();
16451     break;
16452   }
16453   case ARMISD::QADD8b:
16454   case ARMISD::QSUB8b: {
16455     unsigned BitWidth = N->getValueType(0).getSizeInBits();
16456     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
16457     if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
16458         (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
16459       return SDValue();
16460     break;
16461   }
16462   case ISD::INTRINSIC_VOID:
16463   case ISD::INTRINSIC_W_CHAIN:
16464     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
16465     case Intrinsic::arm_neon_vld1:
16466     case Intrinsic::arm_neon_vld1x2:
16467     case Intrinsic::arm_neon_vld1x3:
16468     case Intrinsic::arm_neon_vld1x4:
16469     case Intrinsic::arm_neon_vld2:
16470     case Intrinsic::arm_neon_vld3:
16471     case Intrinsic::arm_neon_vld4:
16472     case Intrinsic::arm_neon_vld2lane:
16473     case Intrinsic::arm_neon_vld3lane:
16474     case Intrinsic::arm_neon_vld4lane:
16475     case Intrinsic::arm_neon_vld2dup:
16476     case Intrinsic::arm_neon_vld3dup:
16477     case Intrinsic::arm_neon_vld4dup:
16478     case Intrinsic::arm_neon_vst1:
16479     case Intrinsic::arm_neon_vst1x2:
16480     case Intrinsic::arm_neon_vst1x3:
16481     case Intrinsic::arm_neon_vst1x4:
16482     case Intrinsic::arm_neon_vst2:
16483     case Intrinsic::arm_neon_vst3:
16484     case Intrinsic::arm_neon_vst4:
16485     case Intrinsic::arm_neon_vst2lane:
16486     case Intrinsic::arm_neon_vst3lane:
16487     case Intrinsic::arm_neon_vst4lane:
16488       return PerformVLDCombine(N, DCI);
16489     case Intrinsic::arm_mve_vld2q:
16490     case Intrinsic::arm_mve_vld4q:
16491     case Intrinsic::arm_mve_vst2q:
16492     case Intrinsic::arm_mve_vst4q:
16493       return PerformMVEVLDCombine(N, DCI);
16494     default: break;
16495     }
16496     break;
16497   }
16498   return SDValue();
16499 }
16500 
16501 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
16502                                                           EVT VT) const {
16503   return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
16504 }
16505 
16506 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
16507                                                        unsigned Alignment,
16508                                                        MachineMemOperand::Flags,
16509                                                        bool *Fast) const {
16510   // Depends what it gets converted into if the type is weird.
16511   if (!VT.isSimple())
16512     return false;
16513 
16514   // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
16515   bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
16516   auto Ty = VT.getSimpleVT().SimpleTy;
16517 
16518   if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
16519     // Unaligned access can use (for example) LRDB, LRDH, LDR
16520     if (AllowsUnaligned) {
16521       if (Fast)
16522         *Fast = Subtarget->hasV7Ops();
16523       return true;
16524     }
16525   }
16526 
16527   if (Ty == MVT::f64 || Ty == MVT::v2f64) {
16528     // For any little-endian targets with neon, we can support unaligned ld/st
16529     // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
16530     // A big-endian target may also explicitly support unaligned accesses
16531     if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
16532       if (Fast)
16533         *Fast = true;
16534       return true;
16535     }
16536   }
16537 
16538   if (!Subtarget->hasMVEIntegerOps())
16539     return false;
16540 
16541   // These are for predicates
16542   if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) {
16543     if (Fast)
16544       *Fast = true;
16545     return true;
16546   }
16547 
16548   // These are for truncated stores/narrowing loads. They are fine so long as
16549   // the alignment is at least the size of the item being loaded
16550   if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
16551       Alignment >= VT.getScalarSizeInBits() / 8) {
16552     if (Fast)
16553       *Fast = true;
16554     return true;
16555   }
16556 
16557   // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
16558   // VSTRW.U32 all store the vector register in exactly the same format, and
16559   // differ only in the range of their immediate offset field and the required
16560   // alignment. So there is always a store that can be used, regardless of
16561   // actual type.
16562   //
16563   // For big endian, that is not the case. But can still emit a (VSTRB.U8;
16564   // VREV64.8) pair and get the same effect. This will likely be better than
16565   // aligning the vector through the stack.
16566   if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
16567       Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
16568       Ty == MVT::v2f64) {
16569     if (Fast)
16570       *Fast = true;
16571     return true;
16572   }
16573 
16574   return false;
16575 }
16576 
16577 
16578 EVT ARMTargetLowering::getOptimalMemOpType(
16579     const MemOp &Op, const AttributeList &FuncAttributes) const {
16580   // See if we can use NEON instructions for this...
16581   if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
16582       !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
16583     bool Fast;
16584     if (Op.size() >= 16 &&
16585         (Op.isAligned(Align(16)) ||
16586          (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1,
16587                                          MachineMemOperand::MONone, &Fast) &&
16588           Fast))) {
16589       return MVT::v2f64;
16590     } else if (Op.size() >= 8 &&
16591                (Op.isAligned(Align(8)) ||
16592                 (allowsMisalignedMemoryAccesses(
16593                      MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) &&
16594                  Fast))) {
16595       return MVT::f64;
16596     }
16597   }
16598 
16599   // Let the target-independent logic figure it out.
16600   return MVT::Other;
16601 }
16602 
16603 // 64-bit integers are split into their high and low parts and held in two
16604 // different registers, so the trunc is free since the low register can just
16605 // be used.
16606 bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
16607   if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
16608     return false;
16609   unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
16610   unsigned DestBits = DstTy->getPrimitiveSizeInBits();
16611   return (SrcBits == 64 && DestBits == 32);
16612 }
16613 
16614 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
16615   if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
16616       !DstVT.isInteger())
16617     return false;
16618   unsigned SrcBits = SrcVT.getSizeInBits();
16619   unsigned DestBits = DstVT.getSizeInBits();
16620   return (SrcBits == 64 && DestBits == 32);
16621 }
16622 
16623 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
16624   if (Val.getOpcode() != ISD::LOAD)
16625     return false;
16626 
16627   EVT VT1 = Val.getValueType();
16628   if (!VT1.isSimple() || !VT1.isInteger() ||
16629       !VT2.isSimple() || !VT2.isInteger())
16630     return false;
16631 
16632   switch (VT1.getSimpleVT().SimpleTy) {
16633   default: break;
16634   case MVT::i1:
16635   case MVT::i8:
16636   case MVT::i16:
16637     // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
16638     return true;
16639   }
16640 
16641   return false;
16642 }
16643 
16644 bool ARMTargetLowering::isFNegFree(EVT VT) const {
16645   if (!VT.isSimple())
16646     return false;
16647 
16648   // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
16649   // negate values directly (fneg is free). So, we don't want to let the DAG
16650   // combiner rewrite fneg into xors and some other instructions.  For f16 and
16651   // FullFP16 argument passing, some bitcast nodes may be introduced,
16652   // triggering this DAG combine rewrite, so we are avoiding that with this.
16653   switch (VT.getSimpleVT().SimpleTy) {
16654   default: break;
16655   case MVT::f16:
16656     return Subtarget->hasFullFP16();
16657   }
16658 
16659   return false;
16660 }
16661 
16662 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
16663 /// of the vector elements.
16664 static bool areExtractExts(Value *Ext1, Value *Ext2) {
16665   auto areExtDoubled = [](Instruction *Ext) {
16666     return Ext->getType()->getScalarSizeInBits() ==
16667            2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
16668   };
16669 
16670   if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
16671       !match(Ext2, m_ZExtOrSExt(m_Value())) ||
16672       !areExtDoubled(cast<Instruction>(Ext1)) ||
16673       !areExtDoubled(cast<Instruction>(Ext2)))
16674     return false;
16675 
16676   return true;
16677 }
16678 
16679 /// Check if sinking \p I's operands to I's basic block is profitable, because
16680 /// the operands can be folded into a target instruction, e.g.
16681 /// sext/zext can be folded into vsubl.
16682 bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
16683                                            SmallVectorImpl<Use *> &Ops) const {
16684   if (!I->getType()->isVectorTy())
16685     return false;
16686 
16687   if (Subtarget->hasNEON()) {
16688     switch (I->getOpcode()) {
16689     case Instruction::Sub:
16690     case Instruction::Add: {
16691       if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
16692         return false;
16693       Ops.push_back(&I->getOperandUse(0));
16694       Ops.push_back(&I->getOperandUse(1));
16695       return true;
16696     }
16697     default:
16698       return false;
16699     }
16700   }
16701 
16702   if (!Subtarget->hasMVEIntegerOps())
16703     return false;
16704 
16705   auto IsFMSMul = [&](Instruction *I) {
16706     if (!I->hasOneUse())
16707       return false;
16708     auto *Sub = cast<Instruction>(*I->users().begin());
16709     return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
16710   };
16711   auto IsFMS = [&](Instruction *I) {
16712     if (match(I->getOperand(0), m_FNeg(m_Value())) ||
16713         match(I->getOperand(1), m_FNeg(m_Value())))
16714       return true;
16715     return false;
16716   };
16717 
16718   auto IsSinker = [&](Instruction *I, int Operand) {
16719     switch (I->getOpcode()) {
16720     case Instruction::Add:
16721     case Instruction::Mul:
16722     case Instruction::FAdd:
16723     case Instruction::ICmp:
16724     case Instruction::FCmp:
16725       return true;
16726     case Instruction::FMul:
16727       return !IsFMSMul(I);
16728     case Instruction::Sub:
16729     case Instruction::FSub:
16730     case Instruction::Shl:
16731     case Instruction::LShr:
16732     case Instruction::AShr:
16733       return Operand == 1;
16734     case Instruction::Call:
16735       if (auto *II = dyn_cast<IntrinsicInst>(I)) {
16736         switch (II->getIntrinsicID()) {
16737         case Intrinsic::fma:
16738           return !IsFMS(I);
16739         case Intrinsic::arm_mve_add_predicated:
16740         case Intrinsic::arm_mve_mul_predicated:
16741         case Intrinsic::arm_mve_qadd_predicated:
16742         case Intrinsic::arm_mve_hadd_predicated:
16743         case Intrinsic::arm_mve_vqdmull_predicated:
16744         case Intrinsic::arm_mve_qdmulh_predicated:
16745         case Intrinsic::arm_mve_qrdmulh_predicated:
16746         case Intrinsic::arm_mve_fma_predicated:
16747           return true;
16748         case Intrinsic::arm_mve_sub_predicated:
16749         case Intrinsic::arm_mve_qsub_predicated:
16750         case Intrinsic::arm_mve_hsub_predicated:
16751           return Operand == 1;
16752         default:
16753           return false;
16754         }
16755       }
16756       return false;
16757     default:
16758       return false;
16759     }
16760   };
16761 
16762   for (auto OpIdx : enumerate(I->operands())) {
16763     Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
16764     // Make sure we are not already sinking this operand
16765     if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
16766       continue;
16767 
16768     Instruction *Shuffle = Op;
16769     if (Shuffle->getOpcode() == Instruction::BitCast)
16770       Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
16771     // We are looking for a splat that can be sunk.
16772     if (!Shuffle ||
16773         !match(Shuffle, m_Shuffle(
16774                             m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
16775                             m_Undef(), m_ZeroMask())))
16776       continue;
16777     if (!IsSinker(I, OpIdx.index()))
16778       continue;
16779 
16780     // All uses of the shuffle should be sunk to avoid duplicating it across gpr
16781     // and vector registers
16782     for (Use &U : Op->uses()) {
16783       Instruction *Insn = cast<Instruction>(U.getUser());
16784       if (!IsSinker(Insn, U.getOperandNo()))
16785         return false;
16786     }
16787 
16788     Ops.push_back(&Shuffle->getOperandUse(0));
16789     if (Shuffle != Op)
16790       Ops.push_back(&Op->getOperandUse(0));
16791     Ops.push_back(&OpIdx.value());
16792   }
16793   return true;
16794 }
16795 
16796 Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
16797   if (!Subtarget->hasMVEIntegerOps())
16798     return nullptr;
16799   Type *SVIType = SVI->getType();
16800   Type *ScalarType = SVIType->getScalarType();
16801 
16802   if (ScalarType->isFloatTy())
16803     return Type::getInt32Ty(SVIType->getContext());
16804   if (ScalarType->isHalfTy())
16805     return Type::getInt16Ty(SVIType->getContext());
16806   return nullptr;
16807 }
16808 
16809 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
16810   EVT VT = ExtVal.getValueType();
16811 
16812   if (!isTypeLegal(VT))
16813     return false;
16814 
16815   if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
16816     if (Ld->isExpandingLoad())
16817       return false;
16818   }
16819 
16820   if (Subtarget->hasMVEIntegerOps())
16821     return true;
16822 
16823   // Don't create a loadext if we can fold the extension into a wide/long
16824   // instruction.
16825   // If there's more than one user instruction, the loadext is desirable no
16826   // matter what.  There can be two uses by the same instruction.
16827   if (ExtVal->use_empty() ||
16828       !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
16829     return true;
16830 
16831   SDNode *U = *ExtVal->use_begin();
16832   if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
16833        U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
16834     return false;
16835 
16836   return true;
16837 }
16838 
16839 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
16840   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16841     return false;
16842 
16843   if (!isTypeLegal(EVT::getEVT(Ty1)))
16844     return false;
16845 
16846   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
16847 
16848   // Assuming the caller doesn't have a zeroext or signext return parameter,
16849   // truncation all the way down to i1 is valid.
16850   return true;
16851 }
16852 
16853 int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
16854                                                 const AddrMode &AM, Type *Ty,
16855                                                 unsigned AS) const {
16856   if (isLegalAddressingMode(DL, AM, Ty, AS)) {
16857     if (Subtarget->hasFPAO())
16858       return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
16859     return 0;
16860   }
16861   return -1;
16862 }
16863 
16864 /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
16865 /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
16866 /// expanded to FMAs when this method returns true, otherwise fmuladd is
16867 /// expanded to fmul + fadd.
16868 ///
16869 /// ARM supports both fused and unfused multiply-add operations; we already
16870 /// lower a pair of fmul and fadd to the latter so it's not clear that there
16871 /// would be a gain or that the gain would be worthwhile enough to risk
16872 /// correctness bugs.
16873 ///
16874 /// For MVE, we set this to true as it helps simplify the need for some
16875 /// patterns (and we don't have the non-fused floating point instruction).
16876 bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
16877                                                    EVT VT) const {
16878   if (!VT.isSimple())
16879     return false;
16880 
16881   switch (VT.getSimpleVT().SimpleTy) {
16882   case MVT::v4f32:
16883   case MVT::v8f16:
16884     return Subtarget->hasMVEFloatOps();
16885   case MVT::f16:
16886     return Subtarget->useFPVFMx16();
16887   case MVT::f32:
16888     return Subtarget->useFPVFMx();
16889   case MVT::f64:
16890     return Subtarget->useFPVFMx64();
16891   default:
16892     break;
16893   }
16894 
16895   return false;
16896 }
16897 
16898 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
16899   if (V < 0)
16900     return false;
16901 
16902   unsigned Scale = 1;
16903   switch (VT.getSimpleVT().SimpleTy) {
16904   case MVT::i1:
16905   case MVT::i8:
16906     // Scale == 1;
16907     break;
16908   case MVT::i16:
16909     // Scale == 2;
16910     Scale = 2;
16911     break;
16912   default:
16913     // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
16914     // Scale == 4;
16915     Scale = 4;
16916     break;
16917   }
16918 
16919   if ((V & (Scale - 1)) != 0)
16920     return false;
16921   return isUInt<5>(V / Scale);
16922 }
16923 
16924 static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
16925                                       const ARMSubtarget *Subtarget) {
16926   if (!VT.isInteger() && !VT.isFloatingPoint())
16927     return false;
16928   if (VT.isVector() && Subtarget->hasNEON())
16929     return false;
16930   if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
16931       !Subtarget->hasMVEFloatOps())
16932     return false;
16933 
16934   bool IsNeg = false;
16935   if (V < 0) {
16936     IsNeg = true;
16937     V = -V;
16938   }
16939 
16940   unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
16941 
16942   // MVE: size * imm7
16943   if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
16944     switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
16945     case MVT::i32:
16946     case MVT::f32:
16947       return isShiftedUInt<7,2>(V);
16948     case MVT::i16:
16949     case MVT::f16:
16950       return isShiftedUInt<7,1>(V);
16951     case MVT::i8:
16952       return isUInt<7>(V);
16953     default:
16954       return false;
16955     }
16956   }
16957 
16958   // half VLDR: 2 * imm8
16959   if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
16960     return isShiftedUInt<8, 1>(V);
16961   // VLDR and LDRD: 4 * imm8
16962   if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
16963     return isShiftedUInt<8, 2>(V);
16964 
16965   if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
16966     // + imm12 or - imm8
16967     if (IsNeg)
16968       return isUInt<8>(V);
16969     return isUInt<12>(V);
16970   }
16971 
16972   return false;
16973 }
16974 
16975 /// isLegalAddressImmediate - Return true if the integer value can be used
16976 /// as the offset of the target addressing mode for load / store of the
16977 /// given type.
16978 static bool isLegalAddressImmediate(int64_t V, EVT VT,
16979                                     const ARMSubtarget *Subtarget) {
16980   if (V == 0)
16981     return true;
16982 
16983   if (!VT.isSimple())
16984     return false;
16985 
16986   if (Subtarget->isThumb1Only())
16987     return isLegalT1AddressImmediate(V, VT);
16988   else if (Subtarget->isThumb2())
16989     return isLegalT2AddressImmediate(V, VT, Subtarget);
16990 
16991   // ARM mode.
16992   if (V < 0)
16993     V = - V;
16994   switch (VT.getSimpleVT().SimpleTy) {
16995   default: return false;
16996   case MVT::i1:
16997   case MVT::i8:
16998   case MVT::i32:
16999     // +- imm12
17000     return isUInt<12>(V);
17001   case MVT::i16:
17002     // +- imm8
17003     return isUInt<8>(V);
17004   case MVT::f32:
17005   case MVT::f64:
17006     if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
17007       return false;
17008     return isShiftedUInt<8, 2>(V);
17009   }
17010 }
17011 
17012 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
17013                                                       EVT VT) const {
17014   int Scale = AM.Scale;
17015   if (Scale < 0)
17016     return false;
17017 
17018   switch (VT.getSimpleVT().SimpleTy) {
17019   default: return false;
17020   case MVT::i1:
17021   case MVT::i8:
17022   case MVT::i16:
17023   case MVT::i32:
17024     if (Scale == 1)
17025       return true;
17026     // r + r << imm
17027     Scale = Scale & ~1;
17028     return Scale == 2 || Scale == 4 || Scale == 8;
17029   case MVT::i64:
17030     // FIXME: What are we trying to model here? ldrd doesn't have an r + r
17031     // version in Thumb mode.
17032     // r + r
17033     if (Scale == 1)
17034       return true;
17035     // r * 2 (this can be lowered to r + r).
17036     if (!AM.HasBaseReg && Scale == 2)
17037       return true;
17038     return false;
17039   case MVT::isVoid:
17040     // Note, we allow "void" uses (basically, uses that aren't loads or
17041     // stores), because arm allows folding a scale into many arithmetic
17042     // operations.  This should be made more precise and revisited later.
17043 
17044     // Allow r << imm, but the imm has to be a multiple of two.
17045     if (Scale & 1) return false;
17046     return isPowerOf2_32(Scale);
17047   }
17048 }
17049 
17050 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
17051                                                       EVT VT) const {
17052   const int Scale = AM.Scale;
17053 
17054   // Negative scales are not supported in Thumb1.
17055   if (Scale < 0)
17056     return false;
17057 
17058   // Thumb1 addressing modes do not support register scaling excepting the
17059   // following cases:
17060   // 1. Scale == 1 means no scaling.
17061   // 2. Scale == 2 this can be lowered to r + r if there is no base register.
17062   return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
17063 }
17064 
17065 /// isLegalAddressingMode - Return true if the addressing mode represented
17066 /// by AM is legal for this target, for a load/store of the specified type.
17067 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
17068                                               const AddrMode &AM, Type *Ty,
17069                                               unsigned AS, Instruction *I) const {
17070   EVT VT = getValueType(DL, Ty, true);
17071   if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
17072     return false;
17073 
17074   // Can never fold addr of global into load/store.
17075   if (AM.BaseGV)
17076     return false;
17077 
17078   switch (AM.Scale) {
17079   case 0:  // no scale reg, must be "r+i" or "r", or "i".
17080     break;
17081   default:
17082     // ARM doesn't support any R+R*scale+imm addr modes.
17083     if (AM.BaseOffs)
17084       return false;
17085 
17086     if (!VT.isSimple())
17087       return false;
17088 
17089     if (Subtarget->isThumb1Only())
17090       return isLegalT1ScaledAddressingMode(AM, VT);
17091 
17092     if (Subtarget->isThumb2())
17093       return isLegalT2ScaledAddressingMode(AM, VT);
17094 
17095     int Scale = AM.Scale;
17096     switch (VT.getSimpleVT().SimpleTy) {
17097     default: return false;
17098     case MVT::i1:
17099     case MVT::i8:
17100     case MVT::i32:
17101       if (Scale < 0) Scale = -Scale;
17102       if (Scale == 1)
17103         return true;
17104       // r + r << imm
17105       return isPowerOf2_32(Scale & ~1);
17106     case MVT::i16:
17107     case MVT::i64:
17108       // r +/- r
17109       if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
17110         return true;
17111       // r * 2 (this can be lowered to r + r).
17112       if (!AM.HasBaseReg && Scale == 2)
17113         return true;
17114       return false;
17115 
17116     case MVT::isVoid:
17117       // Note, we allow "void" uses (basically, uses that aren't loads or
17118       // stores), because arm allows folding a scale into many arithmetic
17119       // operations.  This should be made more precise and revisited later.
17120 
17121       // Allow r << imm, but the imm has to be a multiple of two.
17122       if (Scale & 1) return false;
17123       return isPowerOf2_32(Scale);
17124     }
17125   }
17126   return true;
17127 }
17128 
17129 /// isLegalICmpImmediate - Return true if the specified immediate is legal
17130 /// icmp immediate, that is the target has icmp instructions which can compare
17131 /// a register against the immediate without having to materialize the
17132 /// immediate into a register.
17133 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
17134   // Thumb2 and ARM modes can use cmn for negative immediates.
17135   if (!Subtarget->isThumb())
17136     return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
17137            ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
17138   if (Subtarget->isThumb2())
17139     return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
17140            ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
17141   // Thumb1 doesn't have cmn, and only 8-bit immediates.
17142   return Imm >= 0 && Imm <= 255;
17143 }
17144 
17145 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
17146 /// *or sub* immediate, that is the target has add or sub instructions which can
17147 /// add a register with the immediate without having to materialize the
17148 /// immediate into a register.
17149 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
17150   // Same encoding for add/sub, just flip the sign.
17151   int64_t AbsImm = std::abs(Imm);
17152   if (!Subtarget->isThumb())
17153     return ARM_AM::getSOImmVal(AbsImm) != -1;
17154   if (Subtarget->isThumb2())
17155     return ARM_AM::getT2SOImmVal(AbsImm) != -1;
17156   // Thumb1 only has 8-bit unsigned immediate.
17157   return AbsImm >= 0 && AbsImm <= 255;
17158 }
17159 
17160 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
17161                                       bool isSEXTLoad, SDValue &Base,
17162                                       SDValue &Offset, bool &isInc,
17163                                       SelectionDAG &DAG) {
17164   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
17165     return false;
17166 
17167   if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
17168     // AddressingMode 3
17169     Base = Ptr->getOperand(0);
17170     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
17171       int RHSC = (int)RHS->getZExtValue();
17172       if (RHSC < 0 && RHSC > -256) {
17173         assert(Ptr->getOpcode() == ISD::ADD);
17174         isInc = false;
17175         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
17176         return true;
17177       }
17178     }
17179     isInc = (Ptr->getOpcode() == ISD::ADD);
17180     Offset = Ptr->getOperand(1);
17181     return true;
17182   } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
17183     // AddressingMode 2
17184     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
17185       int RHSC = (int)RHS->getZExtValue();
17186       if (RHSC < 0 && RHSC > -0x1000) {
17187         assert(Ptr->getOpcode() == ISD::ADD);
17188         isInc = false;
17189         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
17190         Base = Ptr->getOperand(0);
17191         return true;
17192       }
17193     }
17194 
17195     if (Ptr->getOpcode() == ISD::ADD) {
17196       isInc = true;
17197       ARM_AM::ShiftOpc ShOpcVal=
17198         ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
17199       if (ShOpcVal != ARM_AM::no_shift) {
17200         Base = Ptr->getOperand(1);
17201         Offset = Ptr->getOperand(0);
17202       } else {
17203         Base = Ptr->getOperand(0);
17204         Offset = Ptr->getOperand(1);
17205       }
17206       return true;
17207     }
17208 
17209     isInc = (Ptr->getOpcode() == ISD::ADD);
17210     Base = Ptr->getOperand(0);
17211     Offset = Ptr->getOperand(1);
17212     return true;
17213   }
17214 
17215   // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
17216   return false;
17217 }
17218 
17219 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
17220                                      bool isSEXTLoad, SDValue &Base,
17221                                      SDValue &Offset, bool &isInc,
17222                                      SelectionDAG &DAG) {
17223   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
17224     return false;
17225 
17226   Base = Ptr->getOperand(0);
17227   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
17228     int RHSC = (int)RHS->getZExtValue();
17229     if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
17230       assert(Ptr->getOpcode() == ISD::ADD);
17231       isInc = false;
17232       Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
17233       return true;
17234     } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
17235       isInc = Ptr->getOpcode() == ISD::ADD;
17236       Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
17237       return true;
17238     }
17239   }
17240 
17241   return false;
17242 }
17243 
17244 static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
17245                                       bool isSEXTLoad, bool IsMasked, bool isLE,
17246                                       SDValue &Base, SDValue &Offset,
17247                                       bool &isInc, SelectionDAG &DAG) {
17248   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
17249     return false;
17250   if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
17251     return false;
17252 
17253   // We allow LE non-masked loads to change the type (for example use a vldrb.8
17254   // as opposed to a vldrw.32). This can allow extra addressing modes or
17255   // alignments for what is otherwise an equivalent instruction.
17256   bool CanChangeType = isLE && !IsMasked;
17257 
17258   ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
17259   int RHSC = (int)RHS->getZExtValue();
17260 
17261   auto IsInRange = [&](int RHSC, int Limit, int Scale) {
17262     if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
17263       assert(Ptr->getOpcode() == ISD::ADD);
17264       isInc = false;
17265       Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
17266       return true;
17267     } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
17268       isInc = Ptr->getOpcode() == ISD::ADD;
17269       Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
17270       return true;
17271     }
17272     return false;
17273   };
17274 
17275   // Try to find a matching instruction based on s/zext, Alignment, Offset and
17276   // (in BE/masked) type.
17277   Base = Ptr->getOperand(0);
17278   if (VT == MVT::v4i16) {
17279     if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
17280       return true;
17281   } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
17282     if (IsInRange(RHSC, 0x80, 1))
17283       return true;
17284   } else if (Alignment >= 4 &&
17285              (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
17286              IsInRange(RHSC, 0x80, 4))
17287     return true;
17288   else if (Alignment >= 2 &&
17289            (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
17290            IsInRange(RHSC, 0x80, 2))
17291     return true;
17292   else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
17293     return true;
17294   return false;
17295 }
17296 
17297 /// getPreIndexedAddressParts - returns true by value, base pointer and
17298 /// offset pointer and addressing mode by reference if the node's address
17299 /// can be legally represented as pre-indexed load / store address.
17300 bool
17301 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
17302                                              SDValue &Offset,
17303                                              ISD::MemIndexedMode &AM,
17304                                              SelectionDAG &DAG) const {
17305   if (Subtarget->isThumb1Only())
17306     return false;
17307 
17308   EVT VT;
17309   SDValue Ptr;
17310   Align Alignment;
17311   bool isSEXTLoad = false;
17312   bool IsMasked = false;
17313   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
17314     Ptr = LD->getBasePtr();
17315     VT = LD->getMemoryVT();
17316     Alignment = LD->getAlign();
17317     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
17318   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
17319     Ptr = ST->getBasePtr();
17320     VT = ST->getMemoryVT();
17321     Alignment = ST->getAlign();
17322   } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
17323     Ptr = LD->getBasePtr();
17324     VT = LD->getMemoryVT();
17325     Alignment = LD->getAlign();
17326     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
17327     IsMasked = true;
17328   } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
17329     Ptr = ST->getBasePtr();
17330     VT = ST->getMemoryVT();
17331     Alignment = ST->getAlign();
17332     IsMasked = true;
17333   } else
17334     return false;
17335 
17336   bool isInc;
17337   bool isLegal = false;
17338   if (VT.isVector())
17339     isLegal = Subtarget->hasMVEIntegerOps() &&
17340               getMVEIndexedAddressParts(
17341                   Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
17342                   Subtarget->isLittle(), Base, Offset, isInc, DAG);
17343   else {
17344     if (Subtarget->isThumb2())
17345       isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
17346                                          Offset, isInc, DAG);
17347     else
17348       isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
17349                                           Offset, isInc, DAG);
17350   }
17351   if (!isLegal)
17352     return false;
17353 
17354   AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
17355   return true;
17356 }
17357 
17358 /// getPostIndexedAddressParts - returns true by value, base pointer and
17359 /// offset pointer and addressing mode by reference if this node can be
17360 /// combined with a load / store to form a post-indexed load / store.
17361 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
17362                                                    SDValue &Base,
17363                                                    SDValue &Offset,
17364                                                    ISD::MemIndexedMode &AM,
17365                                                    SelectionDAG &DAG) const {
17366   EVT VT;
17367   SDValue Ptr;
17368   Align Alignment;
17369   bool isSEXTLoad = false, isNonExt;
17370   bool IsMasked = false;
17371   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
17372     VT = LD->getMemoryVT();
17373     Ptr = LD->getBasePtr();
17374     Alignment = LD->getAlign();
17375     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
17376     isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
17377   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
17378     VT = ST->getMemoryVT();
17379     Ptr = ST->getBasePtr();
17380     Alignment = ST->getAlign();
17381     isNonExt = !ST->isTruncatingStore();
17382   } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
17383     VT = LD->getMemoryVT();
17384     Ptr = LD->getBasePtr();
17385     Alignment = LD->getAlign();
17386     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
17387     isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
17388     IsMasked = true;
17389   } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
17390     VT = ST->getMemoryVT();
17391     Ptr = ST->getBasePtr();
17392     Alignment = ST->getAlign();
17393     isNonExt = !ST->isTruncatingStore();
17394     IsMasked = true;
17395   } else
17396     return false;
17397 
17398   if (Subtarget->isThumb1Only()) {
17399     // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
17400     // must be non-extending/truncating, i32, with an offset of 4.
17401     assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
17402     if (Op->getOpcode() != ISD::ADD || !isNonExt)
17403       return false;
17404     auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
17405     if (!RHS || RHS->getZExtValue() != 4)
17406       return false;
17407 
17408     Offset = Op->getOperand(1);
17409     Base = Op->getOperand(0);
17410     AM = ISD::POST_INC;
17411     return true;
17412   }
17413 
17414   bool isInc;
17415   bool isLegal = false;
17416   if (VT.isVector())
17417     isLegal = Subtarget->hasMVEIntegerOps() &&
17418               getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
17419                                         Subtarget->isLittle(), Base, Offset,
17420                                         isInc, DAG);
17421   else {
17422     if (Subtarget->isThumb2())
17423       isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
17424                                          isInc, DAG);
17425     else
17426       isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
17427                                           isInc, DAG);
17428   }
17429   if (!isLegal)
17430     return false;
17431 
17432   if (Ptr != Base) {
17433     // Swap base ptr and offset to catch more post-index load / store when
17434     // it's legal. In Thumb2 mode, offset must be an immediate.
17435     if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
17436         !Subtarget->isThumb2())
17437       std::swap(Base, Offset);
17438 
17439     // Post-indexed load / store update the base pointer.
17440     if (Ptr != Base)
17441       return false;
17442   }
17443 
17444   AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
17445   return true;
17446 }
17447 
17448 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
17449                                                       KnownBits &Known,
17450                                                       const APInt &DemandedElts,
17451                                                       const SelectionDAG &DAG,
17452                                                       unsigned Depth) const {
17453   unsigned BitWidth = Known.getBitWidth();
17454   Known.resetAll();
17455   switch (Op.getOpcode()) {
17456   default: break;
17457   case ARMISD::ADDC:
17458   case ARMISD::ADDE:
17459   case ARMISD::SUBC:
17460   case ARMISD::SUBE:
17461     // Special cases when we convert a carry to a boolean.
17462     if (Op.getResNo() == 0) {
17463       SDValue LHS = Op.getOperand(0);
17464       SDValue RHS = Op.getOperand(1);
17465       // (ADDE 0, 0, C) will give us a single bit.
17466       if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
17467           isNullConstant(RHS)) {
17468         Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
17469         return;
17470       }
17471     }
17472     break;
17473   case ARMISD::CMOV: {
17474     // Bits are known zero/one if known on the LHS and RHS.
17475     Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
17476     if (Known.isUnknown())
17477       return;
17478 
17479     KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
17480     Known = KnownBits::commonBits(Known, KnownRHS);
17481     return;
17482   }
17483   case ISD::INTRINSIC_W_CHAIN: {
17484     ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
17485     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
17486     switch (IntID) {
17487     default: return;
17488     case Intrinsic::arm_ldaex:
17489     case Intrinsic::arm_ldrex: {
17490       EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
17491       unsigned MemBits = VT.getScalarSizeInBits();
17492       Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
17493       return;
17494     }
17495     }
17496   }
17497   case ARMISD::BFI: {
17498     // Conservatively, we can recurse down the first operand
17499     // and just mask out all affected bits.
17500     Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
17501 
17502     // The operand to BFI is already a mask suitable for removing the bits it
17503     // sets.
17504     ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
17505     const APInt &Mask = CI->getAPIntValue();
17506     Known.Zero &= Mask;
17507     Known.One &= Mask;
17508     return;
17509   }
17510   case ARMISD::VGETLANEs:
17511   case ARMISD::VGETLANEu: {
17512     const SDValue &SrcSV = Op.getOperand(0);
17513     EVT VecVT = SrcSV.getValueType();
17514     assert(VecVT.isVector() && "VGETLANE expected a vector type");
17515     const unsigned NumSrcElts = VecVT.getVectorNumElements();
17516     ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
17517     assert(Pos->getAPIntValue().ult(NumSrcElts) &&
17518            "VGETLANE index out of bounds");
17519     unsigned Idx = Pos->getZExtValue();
17520     APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
17521     Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
17522 
17523     EVT VT = Op.getValueType();
17524     const unsigned DstSz = VT.getScalarSizeInBits();
17525     const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
17526     (void)SrcSz;
17527     assert(SrcSz == Known.getBitWidth());
17528     assert(DstSz > SrcSz);
17529     if (Op.getOpcode() == ARMISD::VGETLANEs)
17530       Known = Known.sext(DstSz);
17531     else {
17532       Known = Known.zext(DstSz);
17533     }
17534     assert(DstSz == Known.getBitWidth());
17535     break;
17536   }
17537   case ARMISD::VMOVrh: {
17538     KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
17539     assert(KnownOp.getBitWidth() == 16);
17540     Known = KnownOp.zext(32);
17541     break;
17542   }
17543   }
17544 }
17545 
17546 bool ARMTargetLowering::targetShrinkDemandedConstant(
17547     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
17548     TargetLoweringOpt &TLO) const {
17549   // Delay optimization, so we don't have to deal with illegal types, or block
17550   // optimizations.
17551   if (!TLO.LegalOps)
17552     return false;
17553 
17554   // Only optimize AND for now.
17555   if (Op.getOpcode() != ISD::AND)
17556     return false;
17557 
17558   EVT VT = Op.getValueType();
17559 
17560   // Ignore vectors.
17561   if (VT.isVector())
17562     return false;
17563 
17564   assert(VT == MVT::i32 && "Unexpected integer type");
17565 
17566   // Make sure the RHS really is a constant.
17567   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
17568   if (!C)
17569     return false;
17570 
17571   unsigned Mask = C->getZExtValue();
17572 
17573   unsigned Demanded = DemandedBits.getZExtValue();
17574   unsigned ShrunkMask = Mask & Demanded;
17575   unsigned ExpandedMask = Mask | ~Demanded;
17576 
17577   // If the mask is all zeros, let the target-independent code replace the
17578   // result with zero.
17579   if (ShrunkMask == 0)
17580     return false;
17581 
17582   // If the mask is all ones, erase the AND. (Currently, the target-independent
17583   // code won't do this, so we have to do it explicitly to avoid an infinite
17584   // loop in obscure cases.)
17585   if (ExpandedMask == ~0U)
17586     return TLO.CombineTo(Op, Op.getOperand(0));
17587 
17588   auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
17589     return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
17590   };
17591   auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
17592     if (NewMask == Mask)
17593       return true;
17594     SDLoc DL(Op);
17595     SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
17596     SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
17597     return TLO.CombineTo(Op, NewOp);
17598   };
17599 
17600   // Prefer uxtb mask.
17601   if (IsLegalMask(0xFF))
17602     return UseMask(0xFF);
17603 
17604   // Prefer uxth mask.
17605   if (IsLegalMask(0xFFFF))
17606     return UseMask(0xFFFF);
17607 
17608   // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
17609   // FIXME: Prefer a contiguous sequence of bits for other optimizations.
17610   if (ShrunkMask < 256)
17611     return UseMask(ShrunkMask);
17612 
17613   // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
17614   // FIXME: Prefer a contiguous sequence of bits for other optimizations.
17615   if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
17616     return UseMask(ExpandedMask);
17617 
17618   // Potential improvements:
17619   //
17620   // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
17621   // We could try to prefer Thumb1 immediates which can be lowered to a
17622   // two-instruction sequence.
17623   // We could try to recognize more legal ARM/Thumb2 immediates here.
17624 
17625   return false;
17626 }
17627 
17628 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
17629     SDValue Op, const APInt &OriginalDemandedBits,
17630     const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
17631     unsigned Depth) const {
17632   unsigned Opc = Op.getOpcode();
17633 
17634   switch (Opc) {
17635   case ARMISD::ASRL:
17636   case ARMISD::LSRL: {
17637     // If this is result 0 and the other result is unused, see if the demand
17638     // bits allow us to shrink this long shift into a standard small shift in
17639     // the opposite direction.
17640     if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
17641         isa<ConstantSDNode>(Op->getOperand(2))) {
17642       unsigned ShAmt = Op->getConstantOperandVal(2);
17643       if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(
17644                             APInt::getAllOnesValue(32) << (32 - ShAmt)))
17645         return TLO.CombineTo(
17646             Op, TLO.DAG.getNode(
17647                     ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
17648                     TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
17649     }
17650     break;
17651   }
17652   }
17653 
17654   return TargetLowering::SimplifyDemandedBitsForTargetNode(
17655       Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
17656 }
17657 
17658 //===----------------------------------------------------------------------===//
17659 //                           ARM Inline Assembly Support
17660 //===----------------------------------------------------------------------===//
17661 
17662 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
17663   // Looking for "rev" which is V6+.
17664   if (!Subtarget->hasV6Ops())
17665     return false;
17666 
17667   InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
17668   std::string AsmStr = IA->getAsmString();
17669   SmallVector<StringRef, 4> AsmPieces;
17670   SplitString(AsmStr, AsmPieces, ";\n");
17671 
17672   switch (AsmPieces.size()) {
17673   default: return false;
17674   case 1:
17675     AsmStr = std::string(AsmPieces[0]);
17676     AsmPieces.clear();
17677     SplitString(AsmStr, AsmPieces, " \t,");
17678 
17679     // rev $0, $1
17680     if (AsmPieces.size() == 3 &&
17681         AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
17682         IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
17683       IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
17684       if (Ty && Ty->getBitWidth() == 32)
17685         return IntrinsicLowering::LowerToByteSwap(CI);
17686     }
17687     break;
17688   }
17689 
17690   return false;
17691 }
17692 
17693 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
17694   // At this point, we have to lower this constraint to something else, so we
17695   // lower it to an "r" or "w". However, by doing this we will force the result
17696   // to be in register, while the X constraint is much more permissive.
17697   //
17698   // Although we are correct (we are free to emit anything, without
17699   // constraints), we might break use cases that would expect us to be more
17700   // efficient and emit something else.
17701   if (!Subtarget->hasVFP2Base())
17702     return "r";
17703   if (ConstraintVT.isFloatingPoint())
17704     return "w";
17705   if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
17706      (ConstraintVT.getSizeInBits() == 64 ||
17707       ConstraintVT.getSizeInBits() == 128))
17708     return "w";
17709 
17710   return "r";
17711 }
17712 
17713 /// getConstraintType - Given a constraint letter, return the type of
17714 /// constraint it is for this target.
17715 ARMTargetLowering::ConstraintType
17716 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
17717   unsigned S = Constraint.size();
17718   if (S == 1) {
17719     switch (Constraint[0]) {
17720     default:  break;
17721     case 'l': return C_RegisterClass;
17722     case 'w': return C_RegisterClass;
17723     case 'h': return C_RegisterClass;
17724     case 'x': return C_RegisterClass;
17725     case 't': return C_RegisterClass;
17726     case 'j': return C_Immediate; // Constant for movw.
17727     // An address with a single base register. Due to the way we
17728     // currently handle addresses it is the same as an 'r' memory constraint.
17729     case 'Q': return C_Memory;
17730     }
17731   } else if (S == 2) {
17732     switch (Constraint[0]) {
17733     default: break;
17734     case 'T': return C_RegisterClass;
17735     // All 'U+' constraints are addresses.
17736     case 'U': return C_Memory;
17737     }
17738   }
17739   return TargetLowering::getConstraintType(Constraint);
17740 }
17741 
17742 /// Examine constraint type and operand type and determine a weight value.
17743 /// This object must already have been set up with the operand type
17744 /// and the current alternative constraint selected.
17745 TargetLowering::ConstraintWeight
17746 ARMTargetLowering::getSingleConstraintMatchWeight(
17747     AsmOperandInfo &info, const char *constraint) const {
17748   ConstraintWeight weight = CW_Invalid;
17749   Value *CallOperandVal = info.CallOperandVal;
17750     // If we don't have a value, we can't do a match,
17751     // but allow it at the lowest weight.
17752   if (!CallOperandVal)
17753     return CW_Default;
17754   Type *type = CallOperandVal->getType();
17755   // Look at the constraint type.
17756   switch (*constraint) {
17757   default:
17758     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
17759     break;
17760   case 'l':
17761     if (type->isIntegerTy()) {
17762       if (Subtarget->isThumb())
17763         weight = CW_SpecificReg;
17764       else
17765         weight = CW_Register;
17766     }
17767     break;
17768   case 'w':
17769     if (type->isFloatingPointTy())
17770       weight = CW_Register;
17771     break;
17772   }
17773   return weight;
17774 }
17775 
17776 using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
17777 
17778 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
17779     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
17780   switch (Constraint.size()) {
17781   case 1:
17782     // GCC ARM Constraint Letters
17783     switch (Constraint[0]) {
17784     case 'l': // Low regs or general regs.
17785       if (Subtarget->isThumb())
17786         return RCPair(0U, &ARM::tGPRRegClass);
17787       return RCPair(0U, &ARM::GPRRegClass);
17788     case 'h': // High regs or no regs.
17789       if (Subtarget->isThumb())
17790         return RCPair(0U, &ARM::hGPRRegClass);
17791       break;
17792     case 'r':
17793       if (Subtarget->isThumb1Only())
17794         return RCPair(0U, &ARM::tGPRRegClass);
17795       return RCPair(0U, &ARM::GPRRegClass);
17796     case 'w':
17797       if (VT == MVT::Other)
17798         break;
17799       if (VT == MVT::f32)
17800         return RCPair(0U, &ARM::SPRRegClass);
17801       if (VT.getSizeInBits() == 64)
17802         return RCPair(0U, &ARM::DPRRegClass);
17803       if (VT.getSizeInBits() == 128)
17804         return RCPair(0U, &ARM::QPRRegClass);
17805       break;
17806     case 'x':
17807       if (VT == MVT::Other)
17808         break;
17809       if (VT == MVT::f32)
17810         return RCPair(0U, &ARM::SPR_8RegClass);
17811       if (VT.getSizeInBits() == 64)
17812         return RCPair(0U, &ARM::DPR_8RegClass);
17813       if (VT.getSizeInBits() == 128)
17814         return RCPair(0U, &ARM::QPR_8RegClass);
17815       break;
17816     case 't':
17817       if (VT == MVT::Other)
17818         break;
17819       if (VT == MVT::f32 || VT == MVT::i32)
17820         return RCPair(0U, &ARM::SPRRegClass);
17821       if (VT.getSizeInBits() == 64)
17822         return RCPair(0U, &ARM::DPR_VFP2RegClass);
17823       if (VT.getSizeInBits() == 128)
17824         return RCPair(0U, &ARM::QPR_VFP2RegClass);
17825       break;
17826     }
17827     break;
17828 
17829   case 2:
17830     if (Constraint[0] == 'T') {
17831       switch (Constraint[1]) {
17832       default:
17833         break;
17834       case 'e':
17835         return RCPair(0U, &ARM::tGPREvenRegClass);
17836       case 'o':
17837         return RCPair(0U, &ARM::tGPROddRegClass);
17838       }
17839     }
17840     break;
17841 
17842   default:
17843     break;
17844   }
17845 
17846   if (StringRef("{cc}").equals_lower(Constraint))
17847     return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
17848 
17849   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17850 }
17851 
17852 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
17853 /// vector.  If it is invalid, don't add anything to Ops.
17854 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17855                                                      std::string &Constraint,
17856                                                      std::vector<SDValue>&Ops,
17857                                                      SelectionDAG &DAG) const {
17858   SDValue Result;
17859 
17860   // Currently only support length 1 constraints.
17861   if (Constraint.length() != 1) return;
17862 
17863   char ConstraintLetter = Constraint[0];
17864   switch (ConstraintLetter) {
17865   default: break;
17866   case 'j':
17867   case 'I': case 'J': case 'K': case 'L':
17868   case 'M': case 'N': case 'O':
17869     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
17870     if (!C)
17871       return;
17872 
17873     int64_t CVal64 = C->getSExtValue();
17874     int CVal = (int) CVal64;
17875     // None of these constraints allow values larger than 32 bits.  Check
17876     // that the value fits in an int.
17877     if (CVal != CVal64)
17878       return;
17879 
17880     switch (ConstraintLetter) {
17881       case 'j':
17882         // Constant suitable for movw, must be between 0 and
17883         // 65535.
17884         if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
17885           if (CVal >= 0 && CVal <= 65535)
17886             break;
17887         return;
17888       case 'I':
17889         if (Subtarget->isThumb1Only()) {
17890           // This must be a constant between 0 and 255, for ADD
17891           // immediates.
17892           if (CVal >= 0 && CVal <= 255)
17893             break;
17894         } else if (Subtarget->isThumb2()) {
17895           // A constant that can be used as an immediate value in a
17896           // data-processing instruction.
17897           if (ARM_AM::getT2SOImmVal(CVal) != -1)
17898             break;
17899         } else {
17900           // A constant that can be used as an immediate value in a
17901           // data-processing instruction.
17902           if (ARM_AM::getSOImmVal(CVal) != -1)
17903             break;
17904         }
17905         return;
17906 
17907       case 'J':
17908         if (Subtarget->isThumb1Only()) {
17909           // This must be a constant between -255 and -1, for negated ADD
17910           // immediates. This can be used in GCC with an "n" modifier that
17911           // prints the negated value, for use with SUB instructions. It is
17912           // not useful otherwise but is implemented for compatibility.
17913           if (CVal >= -255 && CVal <= -1)
17914             break;
17915         } else {
17916           // This must be a constant between -4095 and 4095. It is not clear
17917           // what this constraint is intended for. Implemented for
17918           // compatibility with GCC.
17919           if (CVal >= -4095 && CVal <= 4095)
17920             break;
17921         }
17922         return;
17923 
17924       case 'K':
17925         if (Subtarget->isThumb1Only()) {
17926           // A 32-bit value where only one byte has a nonzero value. Exclude
17927           // zero to match GCC. This constraint is used by GCC internally for
17928           // constants that can be loaded with a move/shift combination.
17929           // It is not useful otherwise but is implemented for compatibility.
17930           if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
17931             break;
17932         } else if (Subtarget->isThumb2()) {
17933           // A constant whose bitwise inverse can be used as an immediate
17934           // value in a data-processing instruction. This can be used in GCC
17935           // with a "B" modifier that prints the inverted value, for use with
17936           // BIC and MVN instructions. It is not useful otherwise but is
17937           // implemented for compatibility.
17938           if (ARM_AM::getT2SOImmVal(~CVal) != -1)
17939             break;
17940         } else {
17941           // A constant whose bitwise inverse can be used as an immediate
17942           // value in a data-processing instruction. This can be used in GCC
17943           // with a "B" modifier that prints the inverted value, for use with
17944           // BIC and MVN instructions. It is not useful otherwise but is
17945           // implemented for compatibility.
17946           if (ARM_AM::getSOImmVal(~CVal) != -1)
17947             break;
17948         }
17949         return;
17950 
17951       case 'L':
17952         if (Subtarget->isThumb1Only()) {
17953           // This must be a constant between -7 and 7,
17954           // for 3-operand ADD/SUB immediate instructions.
17955           if (CVal >= -7 && CVal < 7)
17956             break;
17957         } else if (Subtarget->isThumb2()) {
17958           // A constant whose negation can be used as an immediate value in a
17959           // data-processing instruction. This can be used in GCC with an "n"
17960           // modifier that prints the negated value, for use with SUB
17961           // instructions. It is not useful otherwise but is implemented for
17962           // compatibility.
17963           if (ARM_AM::getT2SOImmVal(-CVal) != -1)
17964             break;
17965         } else {
17966           // A constant whose negation can be used as an immediate value in a
17967           // data-processing instruction. This can be used in GCC with an "n"
17968           // modifier that prints the negated value, for use with SUB
17969           // instructions. It is not useful otherwise but is implemented for
17970           // compatibility.
17971           if (ARM_AM::getSOImmVal(-CVal) != -1)
17972             break;
17973         }
17974         return;
17975 
17976       case 'M':
17977         if (Subtarget->isThumb1Only()) {
17978           // This must be a multiple of 4 between 0 and 1020, for
17979           // ADD sp + immediate.
17980           if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
17981             break;
17982         } else {
17983           // A power of two or a constant between 0 and 32.  This is used in
17984           // GCC for the shift amount on shifted register operands, but it is
17985           // useful in general for any shift amounts.
17986           if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
17987             break;
17988         }
17989         return;
17990 
17991       case 'N':
17992         if (Subtarget->isThumb1Only()) {
17993           // This must be a constant between 0 and 31, for shift amounts.
17994           if (CVal >= 0 && CVal <= 31)
17995             break;
17996         }
17997         return;
17998 
17999       case 'O':
18000         if (Subtarget->isThumb1Only()) {
18001           // This must be a multiple of 4 between -508 and 508, for
18002           // ADD/SUB sp = sp + immediate.
18003           if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
18004             break;
18005         }
18006         return;
18007     }
18008     Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
18009     break;
18010   }
18011 
18012   if (Result.getNode()) {
18013     Ops.push_back(Result);
18014     return;
18015   }
18016   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
18017 }
18018 
18019 static RTLIB::Libcall getDivRemLibcall(
18020     const SDNode *N, MVT::SimpleValueType SVT) {
18021   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
18022           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
18023          "Unhandled Opcode in getDivRemLibcall");
18024   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
18025                   N->getOpcode() == ISD::SREM;
18026   RTLIB::Libcall LC;
18027   switch (SVT) {
18028   default: llvm_unreachable("Unexpected request for libcall!");
18029   case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
18030   case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
18031   case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
18032   case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
18033   }
18034   return LC;
18035 }
18036 
18037 static TargetLowering::ArgListTy getDivRemArgList(
18038     const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
18039   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
18040           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
18041          "Unhandled Opcode in getDivRemArgList");
18042   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
18043                   N->getOpcode() == ISD::SREM;
18044   TargetLowering::ArgListTy Args;
18045   TargetLowering::ArgListEntry Entry;
18046   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
18047     EVT ArgVT = N->getOperand(i).getValueType();
18048     Type *ArgTy = ArgVT.getTypeForEVT(*Context);
18049     Entry.Node = N->getOperand(i);
18050     Entry.Ty = ArgTy;
18051     Entry.IsSExt = isSigned;
18052     Entry.IsZExt = !isSigned;
18053     Args.push_back(Entry);
18054   }
18055   if (Subtarget->isTargetWindows() && Args.size() >= 2)
18056     std::swap(Args[0], Args[1]);
18057   return Args;
18058 }
18059 
18060 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
18061   assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
18062           Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
18063           Subtarget->isTargetWindows()) &&
18064          "Register-based DivRem lowering only");
18065   unsigned Opcode = Op->getOpcode();
18066   assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
18067          "Invalid opcode for Div/Rem lowering");
18068   bool isSigned = (Opcode == ISD::SDIVREM);
18069   EVT VT = Op->getValueType(0);
18070   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
18071   SDLoc dl(Op);
18072 
18073   // If the target has hardware divide, use divide + multiply + subtract:
18074   //     div = a / b
18075   //     rem = a - b * div
18076   //     return {div, rem}
18077   // This should be lowered into UDIV/SDIV + MLS later on.
18078   bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
18079                                         : Subtarget->hasDivideInARMMode();
18080   if (hasDivide && Op->getValueType(0).isSimple() &&
18081       Op->getSimpleValueType(0) == MVT::i32) {
18082     unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
18083     const SDValue Dividend = Op->getOperand(0);
18084     const SDValue Divisor = Op->getOperand(1);
18085     SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
18086     SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
18087     SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
18088 
18089     SDValue Values[2] = {Div, Rem};
18090     return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
18091   }
18092 
18093   RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
18094                                        VT.getSimpleVT().SimpleTy);
18095   SDValue InChain = DAG.getEntryNode();
18096 
18097   TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
18098                                                     DAG.getContext(),
18099                                                     Subtarget);
18100 
18101   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
18102                                          getPointerTy(DAG.getDataLayout()));
18103 
18104   Type *RetTy = StructType::get(Ty, Ty);
18105 
18106   if (Subtarget->isTargetWindows())
18107     InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
18108 
18109   TargetLowering::CallLoweringInfo CLI(DAG);
18110   CLI.setDebugLoc(dl).setChain(InChain)
18111     .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
18112     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
18113 
18114   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
18115   return CallInfo.first;
18116 }
18117 
18118 // Lowers REM using divmod helpers
18119 // see RTABI section 4.2/4.3
18120 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
18121   // Build return types (div and rem)
18122   std::vector<Type*> RetTyParams;
18123   Type *RetTyElement;
18124 
18125   switch (N->getValueType(0).getSimpleVT().SimpleTy) {
18126   default: llvm_unreachable("Unexpected request for libcall!");
18127   case MVT::i8:   RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
18128   case MVT::i16:  RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
18129   case MVT::i32:  RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
18130   case MVT::i64:  RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
18131   }
18132 
18133   RetTyParams.push_back(RetTyElement);
18134   RetTyParams.push_back(RetTyElement);
18135   ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
18136   Type *RetTy = StructType::get(*DAG.getContext(), ret);
18137 
18138   RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
18139                                                              SimpleTy);
18140   SDValue InChain = DAG.getEntryNode();
18141   TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
18142                                                     Subtarget);
18143   bool isSigned = N->getOpcode() == ISD::SREM;
18144   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
18145                                          getPointerTy(DAG.getDataLayout()));
18146 
18147   if (Subtarget->isTargetWindows())
18148     InChain = WinDBZCheckDenominator(DAG, N, InChain);
18149 
18150   // Lower call
18151   CallLoweringInfo CLI(DAG);
18152   CLI.setChain(InChain)
18153      .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
18154      .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
18155   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
18156 
18157   // Return second (rem) result operand (first contains div)
18158   SDNode *ResNode = CallResult.first.getNode();
18159   assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
18160   return ResNode->getOperand(1);
18161 }
18162 
18163 SDValue
18164 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
18165   assert(Subtarget->isTargetWindows() && "unsupported target platform");
18166   SDLoc DL(Op);
18167 
18168   // Get the inputs.
18169   SDValue Chain = Op.getOperand(0);
18170   SDValue Size  = Op.getOperand(1);
18171 
18172   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
18173           "no-stack-arg-probe")) {
18174     MaybeAlign Align =
18175         cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
18176     SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
18177     Chain = SP.getValue(1);
18178     SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
18179     if (Align)
18180       SP =
18181           DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
18182                       DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32));
18183     Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
18184     SDValue Ops[2] = { SP, Chain };
18185     return DAG.getMergeValues(Ops, DL);
18186   }
18187 
18188   SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
18189                               DAG.getConstant(2, DL, MVT::i32));
18190 
18191   SDValue Flag;
18192   Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
18193   Flag = Chain.getValue(1);
18194 
18195   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18196   Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
18197 
18198   SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
18199   Chain = NewSP.getValue(1);
18200 
18201   SDValue Ops[2] = { NewSP, Chain };
18202   return DAG.getMergeValues(Ops, DL);
18203 }
18204 
18205 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
18206   bool IsStrict = Op->isStrictFPOpcode();
18207   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
18208   const unsigned DstSz = Op.getValueType().getSizeInBits();
18209   const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
18210   assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
18211          "Unexpected type for custom-lowering FP_EXTEND");
18212 
18213   assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
18214          "With both FP DP and 16, any FP conversion is legal!");
18215 
18216   assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
18217          "With FP16, 16 to 32 conversion is legal!");
18218 
18219   // Converting from 32 -> 64 is valid if we have FP64.
18220   if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
18221     // FIXME: Remove this when we have strict fp instruction selection patterns
18222     if (IsStrict) {
18223       SDLoc Loc(Op);
18224       SDValue Result = DAG.getNode(ISD::FP_EXTEND,
18225                                    Loc, Op.getValueType(), SrcVal);
18226       return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
18227     }
18228     return Op;
18229   }
18230 
18231   // Either we are converting from 16 -> 64, without FP16 and/or
18232   // FP.double-precision or without Armv8-fp. So we must do it in two
18233   // steps.
18234   // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
18235   // without FP16. So we must do a function call.
18236   SDLoc Loc(Op);
18237   RTLIB::Libcall LC;
18238   MakeLibCallOptions CallOptions;
18239   SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
18240   for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
18241     bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
18242     MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
18243     MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
18244     if (Supported) {
18245       if (IsStrict) {
18246         SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
18247                              {DstVT, MVT::Other}, {Chain, SrcVal});
18248         Chain = SrcVal.getValue(1);
18249       } else {
18250         SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
18251       }
18252     } else {
18253       LC = RTLIB::getFPEXT(SrcVT, DstVT);
18254       assert(LC != RTLIB::UNKNOWN_LIBCALL &&
18255              "Unexpected type for custom-lowering FP_EXTEND");
18256       std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
18257                                             Loc, Chain);
18258     }
18259   }
18260 
18261   return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
18262 }
18263 
18264 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
18265   bool IsStrict = Op->isStrictFPOpcode();
18266 
18267   SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
18268   EVT SrcVT = SrcVal.getValueType();
18269   EVT DstVT = Op.getValueType();
18270   const unsigned DstSz = Op.getValueType().getSizeInBits();
18271   const unsigned SrcSz = SrcVT.getSizeInBits();
18272   (void)DstSz;
18273   assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
18274          "Unexpected type for custom-lowering FP_ROUND");
18275 
18276   assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
18277          "With both FP DP and 16, any FP conversion is legal!");
18278 
18279   SDLoc Loc(Op);
18280 
18281   // Instruction from 32 -> 16 if hasFP16 is valid
18282   if (SrcSz == 32 && Subtarget->hasFP16())
18283     return Op;
18284 
18285   // Lib call from 32 -> 16 / 64 -> [32, 16]
18286   RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
18287   assert(LC != RTLIB::UNKNOWN_LIBCALL &&
18288          "Unexpected type for custom-lowering FP_ROUND");
18289   MakeLibCallOptions CallOptions;
18290   SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
18291   SDValue Result;
18292   std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
18293                                         Loc, Chain);
18294   return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
18295 }
18296 
18297 void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
18298                                  SelectionDAG &DAG) const {
18299   assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS.");
18300   MVT HalfT = MVT::i32;
18301   SDLoc dl(N);
18302   SDValue Hi, Lo, Tmp;
18303 
18304   if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) ||
18305       !isOperationLegalOrCustom(ISD::UADDO, HalfT))
18306     return ;
18307 
18308   unsigned OpTypeBits = HalfT.getScalarSizeInBits();
18309   SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
18310 
18311   Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
18312                    DAG.getConstant(0, dl, HalfT));
18313   Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
18314                    DAG.getConstant(1, dl, HalfT));
18315 
18316   Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi,
18317                     DAG.getConstant(OpTypeBits - 1, dl,
18318                     getShiftAmountTy(HalfT, DAG.getDataLayout())));
18319   Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
18320   Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
18321                    SDValue(Lo.getNode(), 1));
18322   Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
18323   Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
18324 
18325   Results.push_back(Lo);
18326   Results.push_back(Hi);
18327 }
18328 
18329 bool
18330 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
18331   // The ARM target isn't yet aware of offsets.
18332   return false;
18333 }
18334 
18335 bool ARM::isBitFieldInvertedMask(unsigned v) {
18336   if (v == 0xffffffff)
18337     return false;
18338 
18339   // there can be 1's on either or both "outsides", all the "inside"
18340   // bits must be 0's
18341   return isShiftedMask_32(~v);
18342 }
18343 
18344 /// isFPImmLegal - Returns true if the target can instruction select the
18345 /// specified FP immediate natively. If false, the legalizer will
18346 /// materialize the FP immediate as a load from a constant pool.
18347 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
18348                                      bool ForCodeSize) const {
18349   if (!Subtarget->hasVFP3Base())
18350     return false;
18351   if (VT == MVT::f16 && Subtarget->hasFullFP16())
18352     return ARM_AM::getFP16Imm(Imm) != -1;
18353   if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
18354       ARM_AM::getFP32FP16Imm(Imm) != -1)
18355     return true;
18356   if (VT == MVT::f32)
18357     return ARM_AM::getFP32Imm(Imm) != -1;
18358   if (VT == MVT::f64 && Subtarget->hasFP64())
18359     return ARM_AM::getFP64Imm(Imm) != -1;
18360   return false;
18361 }
18362 
18363 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
18364 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
18365 /// specified in the intrinsic calls.
18366 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
18367                                            const CallInst &I,
18368                                            MachineFunction &MF,
18369                                            unsigned Intrinsic) const {
18370   switch (Intrinsic) {
18371   case Intrinsic::arm_neon_vld1:
18372   case Intrinsic::arm_neon_vld2:
18373   case Intrinsic::arm_neon_vld3:
18374   case Intrinsic::arm_neon_vld4:
18375   case Intrinsic::arm_neon_vld2lane:
18376   case Intrinsic::arm_neon_vld3lane:
18377   case Intrinsic::arm_neon_vld4lane:
18378   case Intrinsic::arm_neon_vld2dup:
18379   case Intrinsic::arm_neon_vld3dup:
18380   case Intrinsic::arm_neon_vld4dup: {
18381     Info.opc = ISD::INTRINSIC_W_CHAIN;
18382     // Conservatively set memVT to the entire set of vectors loaded.
18383     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
18384     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
18385     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
18386     Info.ptrVal = I.getArgOperand(0);
18387     Info.offset = 0;
18388     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
18389     Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
18390     // volatile loads with NEON intrinsics not supported
18391     Info.flags = MachineMemOperand::MOLoad;
18392     return true;
18393   }
18394   case Intrinsic::arm_neon_vld1x2:
18395   case Intrinsic::arm_neon_vld1x3:
18396   case Intrinsic::arm_neon_vld1x4: {
18397     Info.opc = ISD::INTRINSIC_W_CHAIN;
18398     // Conservatively set memVT to the entire set of vectors loaded.
18399     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
18400     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
18401     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
18402     Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
18403     Info.offset = 0;
18404     Info.align.reset();
18405     // volatile loads with NEON intrinsics not supported
18406     Info.flags = MachineMemOperand::MOLoad;
18407     return true;
18408   }
18409   case Intrinsic::arm_neon_vst1:
18410   case Intrinsic::arm_neon_vst2:
18411   case Intrinsic::arm_neon_vst3:
18412   case Intrinsic::arm_neon_vst4:
18413   case Intrinsic::arm_neon_vst2lane:
18414   case Intrinsic::arm_neon_vst3lane:
18415   case Intrinsic::arm_neon_vst4lane: {
18416     Info.opc = ISD::INTRINSIC_VOID;
18417     // Conservatively set memVT to the entire set of vectors stored.
18418     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
18419     unsigned NumElts = 0;
18420     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
18421       Type *ArgTy = I.getArgOperand(ArgI)->getType();
18422       if (!ArgTy->isVectorTy())
18423         break;
18424       NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
18425     }
18426     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
18427     Info.ptrVal = I.getArgOperand(0);
18428     Info.offset = 0;
18429     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
18430     Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
18431     // volatile stores with NEON intrinsics not supported
18432     Info.flags = MachineMemOperand::MOStore;
18433     return true;
18434   }
18435   case Intrinsic::arm_neon_vst1x2:
18436   case Intrinsic::arm_neon_vst1x3:
18437   case Intrinsic::arm_neon_vst1x4: {
18438     Info.opc = ISD::INTRINSIC_VOID;
18439     // Conservatively set memVT to the entire set of vectors stored.
18440     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
18441     unsigned NumElts = 0;
18442     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
18443       Type *ArgTy = I.getArgOperand(ArgI)->getType();
18444       if (!ArgTy->isVectorTy())
18445         break;
18446       NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
18447     }
18448     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
18449     Info.ptrVal = I.getArgOperand(0);
18450     Info.offset = 0;
18451     Info.align.reset();
18452     // volatile stores with NEON intrinsics not supported
18453     Info.flags = MachineMemOperand::MOStore;
18454     return true;
18455   }
18456   case Intrinsic::arm_mve_vld2q:
18457   case Intrinsic::arm_mve_vld4q: {
18458     Info.opc = ISD::INTRINSIC_W_CHAIN;
18459     // Conservatively set memVT to the entire set of vectors loaded.
18460     Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
18461     unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
18462     Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
18463     Info.ptrVal = I.getArgOperand(0);
18464     Info.offset = 0;
18465     Info.align = Align(VecTy->getScalarSizeInBits() / 8);
18466     // volatile loads with MVE intrinsics not supported
18467     Info.flags = MachineMemOperand::MOLoad;
18468     return true;
18469   }
18470   case Intrinsic::arm_mve_vst2q:
18471   case Intrinsic::arm_mve_vst4q: {
18472     Info.opc = ISD::INTRINSIC_VOID;
18473     // Conservatively set memVT to the entire set of vectors stored.
18474     Type *VecTy = I.getArgOperand(1)->getType();
18475     unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
18476     Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
18477     Info.ptrVal = I.getArgOperand(0);
18478     Info.offset = 0;
18479     Info.align = Align(VecTy->getScalarSizeInBits() / 8);
18480     // volatile stores with MVE intrinsics not supported
18481     Info.flags = MachineMemOperand::MOStore;
18482     return true;
18483   }
18484   case Intrinsic::arm_ldaex:
18485   case Intrinsic::arm_ldrex: {
18486     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
18487     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
18488     Info.opc = ISD::INTRINSIC_W_CHAIN;
18489     Info.memVT = MVT::getVT(PtrTy->getElementType());
18490     Info.ptrVal = I.getArgOperand(0);
18491     Info.offset = 0;
18492     Info.align = DL.getABITypeAlign(PtrTy->getElementType());
18493     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
18494     return true;
18495   }
18496   case Intrinsic::arm_stlex:
18497   case Intrinsic::arm_strex: {
18498     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
18499     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
18500     Info.opc = ISD::INTRINSIC_W_CHAIN;
18501     Info.memVT = MVT::getVT(PtrTy->getElementType());
18502     Info.ptrVal = I.getArgOperand(1);
18503     Info.offset = 0;
18504     Info.align = DL.getABITypeAlign(PtrTy->getElementType());
18505     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
18506     return true;
18507   }
18508   case Intrinsic::arm_stlexd:
18509   case Intrinsic::arm_strexd:
18510     Info.opc = ISD::INTRINSIC_W_CHAIN;
18511     Info.memVT = MVT::i64;
18512     Info.ptrVal = I.getArgOperand(2);
18513     Info.offset = 0;
18514     Info.align = Align(8);
18515     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
18516     return true;
18517 
18518   case Intrinsic::arm_ldaexd:
18519   case Intrinsic::arm_ldrexd:
18520     Info.opc = ISD::INTRINSIC_W_CHAIN;
18521     Info.memVT = MVT::i64;
18522     Info.ptrVal = I.getArgOperand(0);
18523     Info.offset = 0;
18524     Info.align = Align(8);
18525     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
18526     return true;
18527 
18528   default:
18529     break;
18530   }
18531 
18532   return false;
18533 }
18534 
18535 /// Returns true if it is beneficial to convert a load of a constant
18536 /// to just the constant itself.
18537 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
18538                                                           Type *Ty) const {
18539   assert(Ty->isIntegerTy());
18540 
18541   unsigned Bits = Ty->getPrimitiveSizeInBits();
18542   if (Bits == 0 || Bits > 32)
18543     return false;
18544   return true;
18545 }
18546 
18547 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
18548                                                 unsigned Index) const {
18549   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
18550     return false;
18551 
18552   return (Index == 0 || Index == ResVT.getVectorNumElements());
18553 }
18554 
18555 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
18556                                         ARM_MB::MemBOpt Domain) const {
18557   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
18558 
18559   // First, if the target has no DMB, see what fallback we can use.
18560   if (!Subtarget->hasDataBarrier()) {
18561     // Some ARMv6 cpus can support data barriers with an mcr instruction.
18562     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
18563     // here.
18564     if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
18565       Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
18566       Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
18567                         Builder.getInt32(0), Builder.getInt32(7),
18568                         Builder.getInt32(10), Builder.getInt32(5)};
18569       return Builder.CreateCall(MCR, args);
18570     } else {
18571       // Instead of using barriers, atomic accesses on these subtargets use
18572       // libcalls.
18573       llvm_unreachable("makeDMB on a target so old that it has no barriers");
18574     }
18575   } else {
18576     Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
18577     // Only a full system barrier exists in the M-class architectures.
18578     Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
18579     Constant *CDomain = Builder.getInt32(Domain);
18580     return Builder.CreateCall(DMB, CDomain);
18581   }
18582 }
18583 
18584 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
18585 Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
18586                                                  Instruction *Inst,
18587                                                  AtomicOrdering Ord) const {
18588   switch (Ord) {
18589   case AtomicOrdering::NotAtomic:
18590   case AtomicOrdering::Unordered:
18591     llvm_unreachable("Invalid fence: unordered/non-atomic");
18592   case AtomicOrdering::Monotonic:
18593   case AtomicOrdering::Acquire:
18594     return nullptr; // Nothing to do
18595   case AtomicOrdering::SequentiallyConsistent:
18596     if (!Inst->hasAtomicStore())
18597       return nullptr; // Nothing to do
18598     LLVM_FALLTHROUGH;
18599   case AtomicOrdering::Release:
18600   case AtomicOrdering::AcquireRelease:
18601     if (Subtarget->preferISHSTBarriers())
18602       return makeDMB(Builder, ARM_MB::ISHST);
18603     // FIXME: add a comment with a link to documentation justifying this.
18604     else
18605       return makeDMB(Builder, ARM_MB::ISH);
18606   }
18607   llvm_unreachable("Unknown fence ordering in emitLeadingFence");
18608 }
18609 
18610 Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
18611                                                   Instruction *Inst,
18612                                                   AtomicOrdering Ord) const {
18613   switch (Ord) {
18614   case AtomicOrdering::NotAtomic:
18615   case AtomicOrdering::Unordered:
18616     llvm_unreachable("Invalid fence: unordered/not-atomic");
18617   case AtomicOrdering::Monotonic:
18618   case AtomicOrdering::Release:
18619     return nullptr; // Nothing to do
18620   case AtomicOrdering::Acquire:
18621   case AtomicOrdering::AcquireRelease:
18622   case AtomicOrdering::SequentiallyConsistent:
18623     return makeDMB(Builder, ARM_MB::ISH);
18624   }
18625   llvm_unreachable("Unknown fence ordering in emitTrailingFence");
18626 }
18627 
18628 // Loads and stores less than 64-bits are already atomic; ones above that
18629 // are doomed anyway, so defer to the default libcall and blame the OS when
18630 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
18631 // anything for those.
18632 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
18633   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
18634   return (Size == 64) && !Subtarget->isMClass();
18635 }
18636 
18637 // Loads and stores less than 64-bits are already atomic; ones above that
18638 // are doomed anyway, so defer to the default libcall and blame the OS when
18639 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
18640 // anything for those.
18641 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
18642 // guarantee, see DDI0406C ARM architecture reference manual,
18643 // sections A8.8.72-74 LDRD)
18644 TargetLowering::AtomicExpansionKind
18645 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
18646   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
18647   return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
18648                                                   : AtomicExpansionKind::None;
18649 }
18650 
18651 // For the real atomic operations, we have ldrex/strex up to 32 bits,
18652 // and up to 64 bits on the non-M profiles
18653 TargetLowering::AtomicExpansionKind
18654 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
18655   if (AI->isFloatingPointOperation())
18656     return AtomicExpansionKind::CmpXChg;
18657 
18658   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
18659   bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
18660   return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
18661              ? AtomicExpansionKind::LLSC
18662              : AtomicExpansionKind::None;
18663 }
18664 
18665 // Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used  up to 32
18666 // bits, and up to 64 bits on the non-M profiles.
18667 TargetLowering::AtomicExpansionKind
18668 ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
18669   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
18670   // implement cmpxchg without spilling. If the address being exchanged is also
18671   // on the stack and close enough to the spill slot, this can lead to a
18672   // situation where the monitor always gets cleared and the atomic operation
18673   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
18674   unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
18675   bool HasAtomicCmpXchg =
18676       !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
18677   if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg &&
18678       Size <= (Subtarget->isMClass() ? 32U : 64U))
18679     return AtomicExpansionKind::LLSC;
18680   return AtomicExpansionKind::None;
18681 }
18682 
18683 bool ARMTargetLowering::shouldInsertFencesForAtomic(
18684     const Instruction *I) const {
18685   return InsertFencesForAtomic;
18686 }
18687 
18688 // This has so far only been implemented for MachO.
18689 bool ARMTargetLowering::useLoadStackGuardNode() const {
18690   return Subtarget->isTargetMachO();
18691 }
18692 
18693 void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
18694   if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
18695     return TargetLowering::insertSSPDeclarations(M);
18696 
18697   // MSVC CRT has a global variable holding security cookie.
18698   M.getOrInsertGlobal("__security_cookie",
18699                       Type::getInt8PtrTy(M.getContext()));
18700 
18701   // MSVC CRT has a function to validate security cookie.
18702   FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
18703       "__security_check_cookie", Type::getVoidTy(M.getContext()),
18704       Type::getInt8PtrTy(M.getContext()));
18705   if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
18706     F->addAttribute(1, Attribute::AttrKind::InReg);
18707 }
18708 
18709 Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
18710   // MSVC CRT has a global variable holding security cookie.
18711   if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
18712     return M.getGlobalVariable("__security_cookie");
18713   return TargetLowering::getSDagStackGuard(M);
18714 }
18715 
18716 Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
18717   // MSVC CRT has a function to validate security cookie.
18718   if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
18719     return M.getFunction("__security_check_cookie");
18720   return TargetLowering::getSSPStackGuardCheck(M);
18721 }
18722 
18723 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
18724                                                   unsigned &Cost) const {
18725   // If we do not have NEON, vector types are not natively supported.
18726   if (!Subtarget->hasNEON())
18727     return false;
18728 
18729   // Floating point values and vector values map to the same register file.
18730   // Therefore, although we could do a store extract of a vector type, this is
18731   // better to leave at float as we have more freedom in the addressing mode for
18732   // those.
18733   if (VectorTy->isFPOrFPVectorTy())
18734     return false;
18735 
18736   // If the index is unknown at compile time, this is very expensive to lower
18737   // and it is not possible to combine the store with the extract.
18738   if (!isa<ConstantInt>(Idx))
18739     return false;
18740 
18741   assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
18742   unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedSize();
18743   // We can do a store + vector extract on any vector that fits perfectly in a D
18744   // or Q register.
18745   if (BitWidth == 64 || BitWidth == 128) {
18746     Cost = 0;
18747     return true;
18748   }
18749   return false;
18750 }
18751 
18752 bool ARMTargetLowering::isCheapToSpeculateCttz() const {
18753   return Subtarget->hasV6T2Ops();
18754 }
18755 
18756 bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
18757   return Subtarget->hasV6T2Ops();
18758 }
18759 
18760 bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
18761   return !Subtarget->hasMinSize() || Subtarget->isTargetWindows();
18762 }
18763 
18764 Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
18765                                          AtomicOrdering Ord) const {
18766   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
18767   Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
18768   bool IsAcquire = isAcquireOrStronger(Ord);
18769 
18770   // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
18771   // intrinsic must return {i32, i32} and we have to recombine them into a
18772   // single i64 here.
18773   if (ValTy->getPrimitiveSizeInBits() == 64) {
18774     Intrinsic::ID Int =
18775         IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
18776     Function *Ldrex = Intrinsic::getDeclaration(M, Int);
18777 
18778     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
18779     Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
18780 
18781     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
18782     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
18783     if (!Subtarget->isLittle())
18784       std::swap (Lo, Hi);
18785     Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
18786     Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
18787     return Builder.CreateOr(
18788         Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
18789   }
18790 
18791   Type *Tys[] = { Addr->getType() };
18792   Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
18793   Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);
18794 
18795   return Builder.CreateTruncOrBitCast(
18796       Builder.CreateCall(Ldrex, Addr),
18797       cast<PointerType>(Addr->getType())->getElementType());
18798 }
18799 
18800 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
18801     IRBuilder<> &Builder) const {
18802   if (!Subtarget->hasV7Ops())
18803     return;
18804   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
18805   Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
18806 }
18807 
18808 Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
18809                                                Value *Addr,
18810                                                AtomicOrdering Ord) const {
18811   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
18812   bool IsRelease = isReleaseOrStronger(Ord);
18813 
18814   // Since the intrinsics must have legal type, the i64 intrinsics take two
18815   // parameters: "i32, i32". We must marshal Val into the appropriate form
18816   // before the call.
18817   if (Val->getType()->getPrimitiveSizeInBits() == 64) {
18818     Intrinsic::ID Int =
18819         IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
18820     Function *Strex = Intrinsic::getDeclaration(M, Int);
18821     Type *Int32Ty = Type::getInt32Ty(M->getContext());
18822 
18823     Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
18824     Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
18825     if (!Subtarget->isLittle())
18826       std::swap(Lo, Hi);
18827     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
18828     return Builder.CreateCall(Strex, {Lo, Hi, Addr});
18829   }
18830 
18831   Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
18832   Type *Tys[] = { Addr->getType() };
18833   Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
18834 
18835   return Builder.CreateCall(
18836       Strex, {Builder.CreateZExtOrBitCast(
18837                   Val, Strex->getFunctionType()->getParamType(0)),
18838               Addr});
18839 }
18840 
18841 
18842 bool ARMTargetLowering::alignLoopsWithOptSize() const {
18843   return Subtarget->isMClass();
18844 }
18845 
18846 /// A helper function for determining the number of interleaved accesses we
18847 /// will generate when lowering accesses of the given type.
18848 unsigned
18849 ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
18850                                              const DataLayout &DL) const {
18851   return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
18852 }
18853 
18854 bool ARMTargetLowering::isLegalInterleavedAccessType(
18855     unsigned Factor, FixedVectorType *VecTy, const DataLayout &DL) const {
18856 
18857   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
18858   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
18859 
18860   if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
18861     return false;
18862 
18863   // Ensure the vector doesn't have f16 elements. Even though we could do an
18864   // i16 vldN, we can't hold the f16 vectors and will end up converting via
18865   // f32.
18866   if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
18867     return false;
18868   if (Subtarget->hasMVEIntegerOps() && Factor == 3)
18869     return false;
18870 
18871   // Ensure the number of vector elements is greater than 1.
18872   if (VecTy->getNumElements() < 2)
18873     return false;
18874 
18875   // Ensure the element type is legal.
18876   if (ElSize != 8 && ElSize != 16 && ElSize != 32)
18877     return false;
18878 
18879   // Ensure the total vector size is 64 or a multiple of 128. Types larger than
18880   // 128 will be split into multiple interleaved accesses.
18881   if (Subtarget->hasNEON() && VecSize == 64)
18882     return true;
18883   return VecSize % 128 == 0;
18884 }
18885 
18886 unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
18887   if (Subtarget->hasNEON())
18888     return 4;
18889   if (Subtarget->hasMVEIntegerOps())
18890     return MVEMaxSupportedInterleaveFactor;
18891   return TargetLoweringBase::getMaxSupportedInterleaveFactor();
18892 }
18893 
18894 /// Lower an interleaved load into a vldN intrinsic.
18895 ///
18896 /// E.g. Lower an interleaved load (Factor = 2):
18897 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
18898 ///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
18899 ///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
18900 ///
18901 ///      Into:
18902 ///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
18903 ///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
18904 ///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
18905 bool ARMTargetLowering::lowerInterleavedLoad(
18906     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
18907     ArrayRef<unsigned> Indices, unsigned Factor) const {
18908   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18909          "Invalid interleave factor");
18910   assert(!Shuffles.empty() && "Empty shufflevector input");
18911   assert(Shuffles.size() == Indices.size() &&
18912          "Unmatched number of shufflevectors and indices");
18913 
18914   auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
18915   Type *EltTy = VecTy->getElementType();
18916 
18917   const DataLayout &DL = LI->getModule()->getDataLayout();
18918 
18919   // Skip if we do not have NEON and skip illegal vector types. We can
18920   // "legalize" wide vector types into multiple interleaved accesses as long as
18921   // the vector types are divisible by 128.
18922   if (!isLegalInterleavedAccessType(Factor, VecTy, DL))
18923     return false;
18924 
18925   unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
18926 
18927   // A pointer vector can not be the return type of the ldN intrinsics. Need to
18928   // load integer vectors first and then convert to pointer vectors.
18929   if (EltTy->isPointerTy())
18930     VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
18931 
18932   IRBuilder<> Builder(LI);
18933 
18934   // The base address of the load.
18935   Value *BaseAddr = LI->getPointerOperand();
18936 
18937   if (NumLoads > 1) {
18938     // If we're going to generate more than one load, reset the sub-vector type
18939     // to something legal.
18940     VecTy = FixedVectorType::get(VecTy->getElementType(),
18941                                  VecTy->getNumElements() / NumLoads);
18942 
18943     // We will compute the pointer operand of each load from the original base
18944     // address using GEPs. Cast the base address to a pointer to the scalar
18945     // element type.
18946     BaseAddr = Builder.CreateBitCast(
18947         BaseAddr,
18948         VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
18949   }
18950 
18951   assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
18952 
18953   auto createLoadIntrinsic = [&](Value *BaseAddr) {
18954     if (Subtarget->hasNEON()) {
18955       Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
18956       Type *Tys[] = {VecTy, Int8Ptr};
18957       static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
18958                                                 Intrinsic::arm_neon_vld3,
18959                                                 Intrinsic::arm_neon_vld4};
18960       Function *VldnFunc =
18961           Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
18962 
18963       SmallVector<Value *, 2> Ops;
18964       Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
18965       Ops.push_back(Builder.getInt32(LI->getAlignment()));
18966 
18967       return Builder.CreateCall(VldnFunc, Ops, "vldN");
18968     } else {
18969       assert((Factor == 2 || Factor == 4) &&
18970              "expected interleave factor of 2 or 4 for MVE");
18971       Intrinsic::ID LoadInts =
18972           Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
18973       Type *VecEltTy =
18974           VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace());
18975       Type *Tys[] = {VecTy, VecEltTy};
18976       Function *VldnFunc =
18977           Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);
18978 
18979       SmallVector<Value *, 2> Ops;
18980       Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy));
18981       return Builder.CreateCall(VldnFunc, Ops, "vldN");
18982     }
18983   };
18984 
18985   // Holds sub-vectors extracted from the load intrinsic return values. The
18986   // sub-vectors are associated with the shufflevector instructions they will
18987   // replace.
18988   DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
18989 
18990   for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
18991     // If we're generating more than one load, compute the base address of
18992     // subsequent loads as an offset from the previous.
18993     if (LoadCount > 0)
18994       BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
18995                                             VecTy->getNumElements() * Factor);
18996 
18997     CallInst *VldN = createLoadIntrinsic(BaseAddr);
18998 
18999     // Replace uses of each shufflevector with the corresponding vector loaded
19000     // by ldN.
19001     for (unsigned i = 0; i < Shuffles.size(); i++) {
19002       ShuffleVectorInst *SV = Shuffles[i];
19003       unsigned Index = Indices[i];
19004 
19005       Value *SubVec = Builder.CreateExtractValue(VldN, Index);
19006 
19007       // Convert the integer vector to pointer vector if the element is pointer.
19008       if (EltTy->isPointerTy())
19009         SubVec = Builder.CreateIntToPtr(
19010             SubVec,
19011             FixedVectorType::get(SV->getType()->getElementType(), VecTy));
19012 
19013       SubVecs[SV].push_back(SubVec);
19014     }
19015   }
19016 
19017   // Replace uses of the shufflevector instructions with the sub-vectors
19018   // returned by the load intrinsic. If a shufflevector instruction is
19019   // associated with more than one sub-vector, those sub-vectors will be
19020   // concatenated into a single wide vector.
19021   for (ShuffleVectorInst *SVI : Shuffles) {
19022     auto &SubVec = SubVecs[SVI];
19023     auto *WideVec =
19024         SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
19025     SVI->replaceAllUsesWith(WideVec);
19026   }
19027 
19028   return true;
19029 }
19030 
19031 /// Lower an interleaved store into a vstN intrinsic.
19032 ///
19033 /// E.g. Lower an interleaved store (Factor = 3):
19034 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
19035 ///                                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
19036 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
19037 ///
19038 ///      Into:
19039 ///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
19040 ///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
19041 ///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
19042 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
19043 ///
19044 /// Note that the new shufflevectors will be removed and we'll only generate one
19045 /// vst3 instruction in CodeGen.
19046 ///
19047 /// Example for a more general valid mask (Factor 3). Lower:
19048 ///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
19049 ///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
19050 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
19051 ///
19052 ///      Into:
19053 ///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
19054 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
19055 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
19056 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
19057 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
19058                                               ShuffleVectorInst *SVI,
19059                                               unsigned Factor) const {
19060   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
19061          "Invalid interleave factor");
19062 
19063   auto *VecTy = cast<FixedVectorType>(SVI->getType());
19064   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
19065 
19066   unsigned LaneLen = VecTy->getNumElements() / Factor;
19067   Type *EltTy = VecTy->getElementType();
19068   auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
19069 
19070   const DataLayout &DL = SI->getModule()->getDataLayout();
19071 
19072   // Skip if we do not have NEON and skip illegal vector types. We can
19073   // "legalize" wide vector types into multiple interleaved accesses as long as
19074   // the vector types are divisible by 128.
19075   if (!isLegalInterleavedAccessType(Factor, SubVecTy, DL))
19076     return false;
19077 
19078   unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
19079 
19080   Value *Op0 = SVI->getOperand(0);
19081   Value *Op1 = SVI->getOperand(1);
19082   IRBuilder<> Builder(SI);
19083 
19084   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
19085   // vectors to integer vectors.
19086   if (EltTy->isPointerTy()) {
19087     Type *IntTy = DL.getIntPtrType(EltTy);
19088 
19089     // Convert to the corresponding integer vector.
19090     auto *IntVecTy =
19091         FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType()));
19092     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
19093     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
19094 
19095     SubVecTy = FixedVectorType::get(IntTy, LaneLen);
19096   }
19097 
19098   // The base address of the store.
19099   Value *BaseAddr = SI->getPointerOperand();
19100 
19101   if (NumStores > 1) {
19102     // If we're going to generate more than one store, reset the lane length
19103     // and sub-vector type to something legal.
19104     LaneLen /= NumStores;
19105     SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
19106 
19107     // We will compute the pointer operand of each store from the original base
19108     // address using GEPs. Cast the base address to a pointer to the scalar
19109     // element type.
19110     BaseAddr = Builder.CreateBitCast(
19111         BaseAddr,
19112         SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
19113   }
19114 
19115   assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
19116 
19117   auto Mask = SVI->getShuffleMask();
19118 
19119   auto createStoreIntrinsic = [&](Value *BaseAddr,
19120                                   SmallVectorImpl<Value *> &Shuffles) {
19121     if (Subtarget->hasNEON()) {
19122       static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
19123                                                  Intrinsic::arm_neon_vst3,
19124                                                  Intrinsic::arm_neon_vst4};
19125       Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
19126       Type *Tys[] = {Int8Ptr, SubVecTy};
19127 
19128       Function *VstNFunc = Intrinsic::getDeclaration(
19129           SI->getModule(), StoreInts[Factor - 2], Tys);
19130 
19131       SmallVector<Value *, 6> Ops;
19132       Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
19133       append_range(Ops, Shuffles);
19134       Ops.push_back(Builder.getInt32(SI->getAlignment()));
19135       Builder.CreateCall(VstNFunc, Ops);
19136     } else {
19137       assert((Factor == 2 || Factor == 4) &&
19138              "expected interleave factor of 2 or 4 for MVE");
19139       Intrinsic::ID StoreInts =
19140           Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
19141       Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo(
19142           SI->getPointerAddressSpace());
19143       Type *Tys[] = {EltPtrTy, SubVecTy};
19144       Function *VstNFunc =
19145           Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);
19146 
19147       SmallVector<Value *, 6> Ops;
19148       Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy));
19149       append_range(Ops, Shuffles);
19150       for (unsigned F = 0; F < Factor; F++) {
19151         Ops.push_back(Builder.getInt32(F));
19152         Builder.CreateCall(VstNFunc, Ops);
19153         Ops.pop_back();
19154       }
19155     }
19156   };
19157 
19158   for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
19159     // If we generating more than one store, we compute the base address of
19160     // subsequent stores as an offset from the previous.
19161     if (StoreCount > 0)
19162       BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
19163                                             BaseAddr, LaneLen * Factor);
19164 
19165     SmallVector<Value *, 4> Shuffles;
19166 
19167     // Split the shufflevector operands into sub vectors for the new vstN call.
19168     for (unsigned i = 0; i < Factor; i++) {
19169       unsigned IdxI = StoreCount * LaneLen * Factor + i;
19170       if (Mask[IdxI] >= 0) {
19171         Shuffles.push_back(Builder.CreateShuffleVector(
19172             Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
19173       } else {
19174         unsigned StartMask = 0;
19175         for (unsigned j = 1; j < LaneLen; j++) {
19176           unsigned IdxJ = StoreCount * LaneLen * Factor + j;
19177           if (Mask[IdxJ * Factor + IdxI] >= 0) {
19178             StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
19179             break;
19180           }
19181         }
19182         // Note: If all elements in a chunk are undefs, StartMask=0!
19183         // Note: Filling undef gaps with random elements is ok, since
19184         // those elements were being written anyway (with undefs).
19185         // In the case of all undefs we're defaulting to using elems from 0
19186         // Note: StartMask cannot be negative, it's checked in
19187         // isReInterleaveMask
19188         Shuffles.push_back(Builder.CreateShuffleVector(
19189             Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
19190       }
19191     }
19192 
19193     createStoreIntrinsic(BaseAddr, Shuffles);
19194   }
19195   return true;
19196 }
19197 
19198 enum HABaseType {
19199   HA_UNKNOWN = 0,
19200   HA_FLOAT,
19201   HA_DOUBLE,
19202   HA_VECT64,
19203   HA_VECT128
19204 };
19205 
19206 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
19207                                    uint64_t &Members) {
19208   if (auto *ST = dyn_cast<StructType>(Ty)) {
19209     for (unsigned i = 0; i < ST->getNumElements(); ++i) {
19210       uint64_t SubMembers = 0;
19211       if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
19212         return false;
19213       Members += SubMembers;
19214     }
19215   } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
19216     uint64_t SubMembers = 0;
19217     if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
19218       return false;
19219     Members += SubMembers * AT->getNumElements();
19220   } else if (Ty->isFloatTy()) {
19221     if (Base != HA_UNKNOWN && Base != HA_FLOAT)
19222       return false;
19223     Members = 1;
19224     Base = HA_FLOAT;
19225   } else if (Ty->isDoubleTy()) {
19226     if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
19227       return false;
19228     Members = 1;
19229     Base = HA_DOUBLE;
19230   } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
19231     Members = 1;
19232     switch (Base) {
19233     case HA_FLOAT:
19234     case HA_DOUBLE:
19235       return false;
19236     case HA_VECT64:
19237       return VT->getPrimitiveSizeInBits().getFixedSize() == 64;
19238     case HA_VECT128:
19239       return VT->getPrimitiveSizeInBits().getFixedSize() == 128;
19240     case HA_UNKNOWN:
19241       switch (VT->getPrimitiveSizeInBits().getFixedSize()) {
19242       case 64:
19243         Base = HA_VECT64;
19244         return true;
19245       case 128:
19246         Base = HA_VECT128;
19247         return true;
19248       default:
19249         return false;
19250       }
19251     }
19252   }
19253 
19254   return (Members > 0 && Members <= 4);
19255 }
19256 
19257 /// Return the correct alignment for the current calling convention.
19258 Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
19259                                                        DataLayout DL) const {
19260   const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
19261   if (!ArgTy->isVectorTy())
19262     return ABITypeAlign;
19263 
19264   // Avoid over-aligning vector parameters. It would require realigning the
19265   // stack and waste space for no real benefit.
19266   return std::min(ABITypeAlign, DL.getStackAlignment());
19267 }
19268 
19269 /// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
19270 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
19271 /// passing according to AAPCS rules.
19272 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
19273     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
19274   if (getEffectiveCallingConv(CallConv, isVarArg) !=
19275       CallingConv::ARM_AAPCS_VFP)
19276     return false;
19277 
19278   HABaseType Base = HA_UNKNOWN;
19279   uint64_t Members = 0;
19280   bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
19281   LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
19282 
19283   bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
19284   return IsHA || IsIntArray;
19285 }
19286 
19287 Register ARMTargetLowering::getExceptionPointerRegister(
19288     const Constant *PersonalityFn) const {
19289   // Platforms which do not use SjLj EH may return values in these registers
19290   // via the personality function.
19291   return Subtarget->useSjLjEH() ? Register() : ARM::R0;
19292 }
19293 
19294 Register ARMTargetLowering::getExceptionSelectorRegister(
19295     const Constant *PersonalityFn) const {
19296   // Platforms which do not use SjLj EH may return values in these registers
19297   // via the personality function.
19298   return Subtarget->useSjLjEH() ? Register() : ARM::R1;
19299 }
19300 
19301 void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
19302   // Update IsSplitCSR in ARMFunctionInfo.
19303   ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
19304   AFI->setIsSplitCSR(true);
19305 }
19306 
19307 void ARMTargetLowering::insertCopiesSplitCSR(
19308     MachineBasicBlock *Entry,
19309     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
19310   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
19311   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
19312   if (!IStart)
19313     return;
19314 
19315   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
19316   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
19317   MachineBasicBlock::iterator MBBI = Entry->begin();
19318   for (const MCPhysReg *I = IStart; *I; ++I) {
19319     const TargetRegisterClass *RC = nullptr;
19320     if (ARM::GPRRegClass.contains(*I))
19321       RC = &ARM::GPRRegClass;
19322     else if (ARM::DPRRegClass.contains(*I))
19323       RC = &ARM::DPRRegClass;
19324     else
19325       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
19326 
19327     Register NewVR = MRI->createVirtualRegister(RC);
19328     // Create copy from CSR to a virtual register.
19329     // FIXME: this currently does not emit CFI pseudo-instructions, it works
19330     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
19331     // nounwind. If we want to generalize this later, we may need to emit
19332     // CFI pseudo-instructions.
19333     assert(Entry->getParent()->getFunction().hasFnAttribute(
19334                Attribute::NoUnwind) &&
19335            "Function should be nounwind in insertCopiesSplitCSR!");
19336     Entry->addLiveIn(*I);
19337     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
19338         .addReg(*I);
19339 
19340     // Insert the copy-back instructions right before the terminator.
19341     for (auto *Exit : Exits)
19342       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
19343               TII->get(TargetOpcode::COPY), *I)
19344           .addReg(NewVR);
19345   }
19346 }
19347 
19348 void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
19349   MF.getFrameInfo().computeMaxCallFrameSize(MF);
19350   TargetLoweringBase::finalizeLowering(MF);
19351 }
19352