xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (revision 4c053c17f2c8a715988f215d16284879857ca376)
1 //===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64LegalizerInfo.h"
15 #include "AArch64RegisterBankInfo.h"
16 #include "AArch64Subtarget.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
19 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
20 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
21 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/Utils.h"
24 #include "llvm/CodeGen/MachineInstr.h"
25 #include "llvm/CodeGen/MachineRegisterInfo.h"
26 #include "llvm/CodeGen/TargetOpcodes.h"
27 #include "llvm/CodeGen/ValueTypes.h"
28 #include "llvm/IR/DerivedTypes.h"
29 #include "llvm/IR/Intrinsics.h"
30 #include "llvm/IR/IntrinsicsAArch64.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/MathExtras.h"
33 #include <initializer_list>
34 
35 #define DEBUG_TYPE "aarch64-legalinfo"
36 
37 using namespace llvm;
38 using namespace LegalizeActions;
39 using namespace LegalizeMutations;
40 using namespace LegalityPredicates;
41 using namespace MIPatternMatch;
42 
43 AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
44     : ST(&ST) {
45   using namespace TargetOpcode;
46   const LLT p0 = LLT::pointer(0, 64);
47   const LLT s8 = LLT::scalar(8);
48   const LLT s16 = LLT::scalar(16);
49   const LLT s32 = LLT::scalar(32);
50   const LLT s64 = LLT::scalar(64);
51   const LLT s128 = LLT::scalar(128);
52   const LLT v16s8 = LLT::fixed_vector(16, 8);
53   const LLT v8s8 = LLT::fixed_vector(8, 8);
54   const LLT v4s8 = LLT::fixed_vector(4, 8);
55   const LLT v8s16 = LLT::fixed_vector(8, 16);
56   const LLT v4s16 = LLT::fixed_vector(4, 16);
57   const LLT v2s16 = LLT::fixed_vector(2, 16);
58   const LLT v2s32 = LLT::fixed_vector(2, 32);
59   const LLT v4s32 = LLT::fixed_vector(4, 32);
60   const LLT v2s64 = LLT::fixed_vector(2, 64);
61   const LLT v2p0 = LLT::fixed_vector(2, p0);
62 
63   std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
64                                                         v16s8, v8s16, v4s32,
65                                                         v2s64, v2p0,
66                                                         /* End 128bit types */
67                                                         /* Begin 64bit types */
68                                                         v8s8, v4s16, v2s32};
69   std::initializer_list<LLT> ScalarAndPtrTypesList = {s8, s16, s32, s64, p0};
70   SmallVector<LLT, 8> PackedVectorAllTypesVec(PackedVectorAllTypeList);
71   SmallVector<LLT, 8> ScalarAndPtrTypesVec(ScalarAndPtrTypesList);
72 
73   const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
74 
75   // FIXME: support subtargets which have neon/fp-armv8 disabled.
76   if (!ST.hasNEON() || !ST.hasFPARMv8()) {
77     getLegacyLegalizerInfo().computeTables();
78     return;
79   }
80 
81   // Some instructions only support s16 if the subtarget has full 16-bit FP
82   // support.
83   const bool HasFP16 = ST.hasFullFP16();
84   const LLT &MinFPScalar = HasFP16 ? s16 : s32;
85 
86   const bool HasCSSC = ST.hasCSSC();
87   const bool HasRCPC3 = ST.hasRCPC3();
88 
89   getActionDefinitionsBuilder(
90       {G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER})
91       .legalFor({p0, s8, s16, s32, s64})
92       .legalFor(PackedVectorAllTypeList)
93       .widenScalarToNextPow2(0)
94       .clampScalar(0, s8, s64)
95       .fewerElementsIf(
96           [=](const LegalityQuery &Query) {
97             return Query.Types[0].isVector() &&
98                    (Query.Types[0].getElementType() != s64 ||
99                     Query.Types[0].getNumElements() != 2);
100           },
101           [=](const LegalityQuery &Query) {
102             LLT EltTy = Query.Types[0].getElementType();
103             if (EltTy == s64)
104               return std::make_pair(0, LLT::fixed_vector(2, 64));
105             return std::make_pair(0, EltTy);
106           });
107 
108   getActionDefinitionsBuilder(G_PHI)
109       .legalFor({p0, s16, s32, s64})
110       .legalFor(PackedVectorAllTypeList)
111       .widenScalarToNextPow2(0)
112       .clampScalar(0, s16, s64)
113       // Maximum: sN * k = 128
114       .clampMaxNumElements(0, s8, 16)
115       .clampMaxNumElements(0, s16, 8)
116       .clampMaxNumElements(0, s32, 4)
117       .clampMaxNumElements(0, s64, 2)
118       .clampMaxNumElements(0, p0, 2);
119 
120   getActionDefinitionsBuilder(G_BSWAP)
121       .legalFor({s32, s64, v4s32, v2s32, v2s64})
122       .widenScalarToNextPow2(0)
123       .clampScalar(0, s32, s64);
124 
125   getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
126       .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
127       .widenScalarToNextPow2(0)
128       .clampScalar(0, s32, s64)
129       .clampMaxNumElements(0, s8, 16)
130       .clampMaxNumElements(0, s16, 8)
131       .clampNumElements(0, v2s32, v4s32)
132       .clampNumElements(0, v2s64, v2s64)
133       .minScalarOrEltIf(
134           [=](const LegalityQuery &Query) {
135             return Query.Types[0].getNumElements() <= 2;
136           },
137           0, s32)
138       .minScalarOrEltIf(
139           [=](const LegalityQuery &Query) {
140             return Query.Types[0].getNumElements() <= 4;
141           },
142           0, s16)
143       .minScalarOrEltIf(
144           [=](const LegalityQuery &Query) {
145             return Query.Types[0].getNumElements() <= 16;
146           },
147           0, s8)
148       .moreElementsToNextPow2(0);
149 
150   getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
151       .customIf([=](const LegalityQuery &Query) {
152         const auto &SrcTy = Query.Types[0];
153         const auto &AmtTy = Query.Types[1];
154         return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
155                AmtTy.getSizeInBits() == 32;
156       })
157       .legalFor({
158           {s32, s32},
159           {s32, s64},
160           {s64, s64},
161           {v8s8, v8s8},
162           {v16s8, v16s8},
163           {v4s16, v4s16},
164           {v8s16, v8s16},
165           {v2s32, v2s32},
166           {v4s32, v4s32},
167           {v2s64, v2s64},
168       })
169       .widenScalarToNextPow2(0)
170       .clampScalar(1, s32, s64)
171       .clampScalar(0, s32, s64)
172       .clampNumElements(0, v8s8, v16s8)
173       .clampNumElements(0, v4s16, v8s16)
174       .clampNumElements(0, v2s32, v4s32)
175       .clampNumElements(0, v2s64, v2s64)
176       .moreElementsToNextPow2(0)
177       .minScalarSameAs(1, 0);
178 
179   getActionDefinitionsBuilder(G_PTR_ADD)
180       .legalFor({{p0, s64}, {v2p0, v2s64}})
181       .clampScalar(1, s64, s64);
182 
183   getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
184 
185   getActionDefinitionsBuilder({G_SDIV, G_UDIV})
186       .legalFor({s32, s64})
187       .libcallFor({s128})
188       .clampScalar(0, s32, s64)
189       .widenScalarToNextPow2(0)
190       .scalarize(0);
191 
192   getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
193       .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32})
194       .widenScalarOrEltToNextPow2(0)
195       .clampScalarOrElt(0, s32, s64)
196       .clampNumElements(0, v2s32, v4s32)
197       .clampNumElements(0, v2s64, v2s64)
198       .moreElementsToNextPow2(0);
199 
200 
201   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
202       .widenScalarToNextPow2(0, /*Min = */ 32)
203       .clampScalar(0, s32, s64)
204       .lower();
205 
206   getActionDefinitionsBuilder({G_SMULH, G_UMULH})
207       .legalFor({s64, v8s16, v16s8, v4s32})
208       .lower();
209 
210   auto &MinMaxActions = getActionDefinitionsBuilder(
211       {G_SMIN, G_SMAX, G_UMIN, G_UMAX});
212   if (HasCSSC)
213     MinMaxActions
214         .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
215         // Making clamping conditional on CSSC extension as without legal types we
216         // lower to CMP which can fold one of the two sxtb's we'd otherwise need
217         // if we detect a type smaller than 32-bit.
218         .minScalar(0, s32);
219   else
220     MinMaxActions
221         .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32});
222   MinMaxActions
223       .clampNumElements(0, v8s8, v16s8)
224       .clampNumElements(0, v4s16, v8s16)
225       .clampNumElements(0, v2s32, v4s32)
226       // FIXME: This shouldn't be needed as v2s64 types are going to
227       // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
228       .clampNumElements(0, v2s64, v2s64)
229       .lower();
230 
231   getActionDefinitionsBuilder(
232       {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
233       .legalFor({{s32, s32}, {s64, s32}})
234       .clampScalar(0, s32, s64)
235        .clampScalar(1, s32, s64)
236       .widenScalarToNextPow2(0);
237 
238   getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FNEG,
239                                G_FABS, G_FSQRT, G_FMAXNUM, G_FMINNUM,
240                                G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR,
241                                G_FRINT, G_FNEARBYINT, G_INTRINSIC_TRUNC,
242                                G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
243       .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
244       .legalIf([=](const LegalityQuery &Query) {
245         const auto &Ty = Query.Types[0];
246         return (Ty == v8s16 || Ty == v4s16) && HasFP16;
247       })
248       .libcallFor({s128})
249       .minScalarOrElt(0, MinFPScalar)
250       .clampNumElements(0, v4s16, v8s16)
251       .clampNumElements(0, v2s32, v4s32)
252       .clampNumElements(0, v2s64, v2s64)
253       .moreElementsToNextPow2(0);
254 
255   getActionDefinitionsBuilder(G_FREM)
256       .libcallFor({s32, s64})
257       .minScalar(0, s32)
258       .scalarize(0);
259 
260   getActionDefinitionsBuilder(G_INTRINSIC_LRINT)
261       // If we don't have full FP16 support, then scalarize the elements of
262       // vectors containing fp16 types.
263       .fewerElementsIf(
264           [=, &ST](const LegalityQuery &Query) {
265             const auto &Ty = Query.Types[0];
266             return Ty.isVector() && Ty.getElementType() == s16 &&
267                    !ST.hasFullFP16();
268           },
269           [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
270       // If we don't have full FP16 support, then widen s16 to s32 if we
271       // encounter it.
272       .widenScalarIf(
273           [=, &ST](const LegalityQuery &Query) {
274             return Query.Types[0] == s16 && !ST.hasFullFP16();
275           },
276           [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
277       .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16});
278 
279   getActionDefinitionsBuilder(
280       {G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2, G_FLOG10,
281        G_FEXP, G_FEXP2, G_FEXP10})
282       // We need a call for these, so we always need to scalarize.
283       .scalarize(0)
284       // Regardless of FP16 support, widen 16-bit elements to 32-bits.
285       .minScalar(0, s32)
286       .libcallFor({s32, s64});
287   getActionDefinitionsBuilder(G_FPOWI)
288       .scalarize(0)
289       .minScalar(0, s32)
290       .libcallFor({{s32, s32}, {s64, s32}});
291 
292   getActionDefinitionsBuilder(G_INSERT)
293       .legalIf(all(typeInSet(0, {s32, s64, p0}),
294                    typeInSet(1, {s8, s16, s32}), smallerThan(1, 0)))
295       .widenScalarToNextPow2(0)
296       .clampScalar(0, s32, s64)
297       .widenScalarToNextPow2(1)
298       .minScalar(1, s8)
299       .maxScalarIf(typeInSet(0, {s32}), 1, s16)
300       .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);
301 
302   getActionDefinitionsBuilder(G_EXTRACT)
303       .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
304                    typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
305       .widenScalarToNextPow2(1)
306       .clampScalar(1, s32, s128)
307       .widenScalarToNextPow2(0)
308       .minScalar(0, s16)
309       .maxScalarIf(typeInSet(1, {s32}), 0, s16)
310       .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
311       .maxScalarIf(typeInSet(1, {s128}), 0, s64);
312 
313 
314   for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
315     auto &Actions =  getActionDefinitionsBuilder(Op);
316 
317     if (Op == G_SEXTLOAD)
318       Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));
319 
320     // Atomics have zero extending behavior.
321     Actions
322       .legalForTypesWithMemDesc({{s32, p0, s8, 8},
323                                  {s32, p0, s16, 8},
324                                  {s32, p0, s32, 8},
325                                  {s64, p0, s8, 2},
326                                  {s64, p0, s16, 2},
327                                  {s64, p0, s32, 4},
328                                  {s64, p0, s64, 8},
329                                  {p0, p0, s64, 8},
330                                  {v2s32, p0, s64, 8}})
331       .widenScalarToNextPow2(0)
332       .clampScalar(0, s32, s64)
333       // TODO: We could support sum-of-pow2's but the lowering code doesn't know
334       //       how to do that yet.
335       .unsupportedIfMemSizeNotPow2()
336       // Lower anything left over into G_*EXT and G_LOAD
337       .lower();
338   }
339 
340   auto IsPtrVecPred = [=](const LegalityQuery &Query) {
341     const LLT &ValTy = Query.Types[0];
342     if (!ValTy.isVector())
343       return false;
344     const LLT EltTy = ValTy.getElementType();
345     return EltTy.isPointer() && EltTy.getAddressSpace() == 0;
346   };
347 
348   getActionDefinitionsBuilder(G_LOAD)
349       .customIf([=](const LegalityQuery &Query) {
350         return HasRCPC3 && Query.Types[0] == s128 &&
351                Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
352       })
353       .customIf([=](const LegalityQuery &Query) {
354         return Query.Types[0] == s128 &&
355                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
356       })
357       .legalForTypesWithMemDesc({{s8, p0, s8, 8},
358                                  {s16, p0, s16, 8},
359                                  {s32, p0, s32, 8},
360                                  {s64, p0, s64, 8},
361                                  {p0, p0, s64, 8},
362                                  {s128, p0, s128, 8},
363                                  {v8s8, p0, s64, 8},
364                                  {v16s8, p0, s128, 8},
365                                  {v4s16, p0, s64, 8},
366                                  {v8s16, p0, s128, 8},
367                                  {v2s32, p0, s64, 8},
368                                  {v4s32, p0, s128, 8},
369                                  {v2s64, p0, s128, 8}})
370       // These extends are also legal
371       .legalForTypesWithMemDesc(
372           {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
373       .widenScalarToNextPow2(0, /* MinSize = */ 8)
374       .lowerIfMemSizeNotByteSizePow2()
375       .clampScalar(0, s8, s64)
376       .narrowScalarIf(
377           [=](const LegalityQuery &Query) {
378             // Clamp extending load results to 32-bits.
379             return Query.Types[0].isScalar() &&
380                    Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
381                    Query.Types[0].getSizeInBits() > 32;
382           },
383           changeTo(0, s32))
384       .clampMaxNumElements(0, s8, 16)
385       .clampMaxNumElements(0, s16, 8)
386       .clampMaxNumElements(0, s32, 4)
387       .clampMaxNumElements(0, s64, 2)
388       .clampMaxNumElements(0, p0, 2)
389       .customIf(IsPtrVecPred)
390       .scalarizeIf(typeIs(0, v2s16), 0);
391 
392   getActionDefinitionsBuilder(G_STORE)
393       .customIf([=](const LegalityQuery &Query) {
394         return HasRCPC3 && Query.Types[0] == s128 &&
395                Query.MMODescrs[0].Ordering == AtomicOrdering::Release;
396       })
397       .customIf([=](const LegalityQuery &Query) {
398         return Query.Types[0] == s128 &&
399                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
400       })
401       .legalForTypesWithMemDesc(
402           {{s8, p0, s8, 8},     {s16, p0, s8, 8},  // truncstorei8 from s16
403            {s32, p0, s8, 8},                       // truncstorei8 from s32
404            {s64, p0, s8, 8},                       // truncstorei8 from s64
405            {s16, p0, s16, 8},   {s32, p0, s16, 8}, // truncstorei16 from s32
406            {s64, p0, s16, 8},                      // truncstorei16 from s64
407            {s32, p0, s8, 8},    {s32, p0, s16, 8},    {s32, p0, s32, 8},
408            {s64, p0, s64, 8},   {s64, p0, s32, 8}, // truncstorei32 from s64
409            {p0, p0, s64, 8},    {s128, p0, s128, 8},  {v16s8, p0, s128, 8},
410            {v8s8, p0, s64, 8},  {v4s16, p0, s64, 8},  {v8s16, p0, s128, 8},
411            {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
412       .clampScalar(0, s8, s64)
413       .lowerIf([=](const LegalityQuery &Query) {
414         return Query.Types[0].isScalar() &&
415                Query.Types[0] != Query.MMODescrs[0].MemoryTy;
416       })
417       // Maximum: sN * k = 128
418       .clampMaxNumElements(0, s8, 16)
419       .clampMaxNumElements(0, s16, 8)
420       .clampMaxNumElements(0, s32, 4)
421       .clampMaxNumElements(0, s64, 2)
422       .clampMaxNumElements(0, p0, 2)
423       .lowerIfMemSizeNotPow2()
424       .customIf(IsPtrVecPred)
425       .scalarizeIf(typeIs(0, v2s16), 0);
426 
427   getActionDefinitionsBuilder(G_INDEXED_STORE)
428       // Idx 0 == Ptr, Idx 1 == Val
429       // TODO: we can implement legalizations but as of now these are
430       // generated in a very specific way.
431       .legalForTypesWithMemDesc({
432           {p0, s8, s8, 8},
433           {p0, s16, s16, 8},
434           {p0, s32, s8, 8},
435           {p0, s32, s16, 8},
436           {p0, s32, s32, 8},
437           {p0, s64, s64, 8},
438           {p0, p0, p0, 8},
439           {p0, v8s8, v8s8, 8},
440           {p0, v16s8, v16s8, 8},
441           {p0, v4s16, v4s16, 8},
442           {p0, v8s16, v8s16, 8},
443           {p0, v2s32, v2s32, 8},
444           {p0, v4s32, v4s32, 8},
445           {p0, v2s64, v2s64, 8},
446           {p0, v2p0, v2p0, 8},
447           {p0, s128, s128, 8},
448       })
449       .unsupported();
450 
451   auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {
452     LLT LdTy = Query.Types[0];
453     LLT PtrTy = Query.Types[1];
454     if (!llvm::is_contained(PackedVectorAllTypesVec, LdTy) &&
455         !llvm::is_contained(ScalarAndPtrTypesVec, LdTy) && LdTy != s128)
456       return false;
457     if (PtrTy != p0)
458       return false;
459     return true;
460   };
461   getActionDefinitionsBuilder(G_INDEXED_LOAD)
462       .unsupportedIf(
463           atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
464       .legalIf(IndexedLoadBasicPred)
465       .unsupported();
466   getActionDefinitionsBuilder({G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD})
467       .unsupportedIf(
468           atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
469       .legalIf(all(typeInSet(0, {s16, s32, s64}),
470                    LegalityPredicate([=](const LegalityQuery &Q) {
471                      LLT LdTy = Q.Types[0];
472                      LLT PtrTy = Q.Types[1];
473                      LLT MemTy = Q.MMODescrs[0].MemoryTy;
474                      if (PtrTy != p0)
475                        return false;
476                      if (LdTy == s16)
477                        return MemTy == s8;
478                      if (LdTy == s32)
479                        return MemTy == s8 || MemTy == s16;
480                      if (LdTy == s64)
481                        return MemTy == s8 || MemTy == s16 || MemTy == s32;
482                      return false;
483                    })))
484       .unsupported();
485 
486   // Constants
487   getActionDefinitionsBuilder(G_CONSTANT)
488       .legalFor({p0, s8, s16, s32, s64})
489       .widenScalarToNextPow2(0)
490       .clampScalar(0, s8, s64);
491   getActionDefinitionsBuilder(G_FCONSTANT)
492       .legalIf([=](const LegalityQuery &Query) {
493         const auto &Ty = Query.Types[0];
494         if (HasFP16 && Ty == s16)
495           return true;
496         return Ty == s32 || Ty == s64 || Ty == s128;
497       })
498       .clampScalar(0, MinFPScalar, s128);
499 
500   // FIXME: fix moreElementsToNextPow2
501   getActionDefinitionsBuilder(G_ICMP)
502       .legalFor({{s32, s32},
503                  {s32, s64},
504                  {s32, p0},
505                  {v4s32, v4s32},
506                  {v2s32, v2s32},
507                  {v2s64, v2s64},
508                  {v2s64, v2p0},
509                  {v4s16, v4s16},
510                  {v8s16, v8s16},
511                  {v8s8, v8s8},
512                  {v16s8, v16s8}})
513       .widenScalarOrEltToNextPow2(1)
514       .clampScalar(1, s32, s64)
515       .clampScalar(0, s32, s32)
516       .minScalarEltSameAsIf(
517           [=](const LegalityQuery &Query) {
518             const LLT &Ty = Query.Types[0];
519             const LLT &SrcTy = Query.Types[1];
520             return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
521                    Ty.getElementType() != SrcTy.getElementType();
522           },
523           0, 1)
524       .minScalarOrEltIf(
525           [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
526           1, s32)
527       .minScalarOrEltIf(
528           [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
529           s64)
530       .moreElementsToNextPow2(0)
531       .clampNumElements(0, v8s8, v16s8)
532       .clampNumElements(0, v4s16, v8s16)
533       .clampNumElements(0, v2s32, v4s32)
534       .clampNumElements(0, v2s64, v2s64);
535 
536   getActionDefinitionsBuilder(G_FCMP)
537       // If we don't have full FP16 support, then scalarize the elements of
538       // vectors containing fp16 types.
539       .fewerElementsIf(
540           [=](const LegalityQuery &Query) {
541             const auto &Ty = Query.Types[0];
542             return Ty.isVector() && Ty.getElementType() == s16 && !HasFP16;
543           },
544           [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
545       // If we don't have full FP16 support, then widen s16 to s32 if we
546       // encounter it.
547       .widenScalarIf(
548           [=](const LegalityQuery &Query) {
549             return Query.Types[0] == s16 && !HasFP16;
550           },
551           [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
552       .legalFor({{s16, s16},
553                  {s32, s32},
554                  {s32, s64},
555                  {v4s32, v4s32},
556                  {v2s32, v2s32},
557                  {v2s64, v2s64},
558                  {v4s16, v4s16},
559                  {v8s16, v8s16}})
560       .widenScalarOrEltToNextPow2(1)
561       .clampScalar(1, s32, s64)
562       .clampScalar(0, s32, s32)
563       .minScalarEltSameAsIf(
564           [=](const LegalityQuery &Query) {
565             const LLT &Ty = Query.Types[0];
566             const LLT &SrcTy = Query.Types[1];
567             return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
568                    Ty.getElementType() != SrcTy.getElementType();
569           },
570           0, 1)
571       .clampNumElements(0, v2s32, v4s32)
572       .clampMaxNumElements(1, s64, 2);
573 
574   // Extensions
575   auto ExtLegalFunc = [=](const LegalityQuery &Query) {
576     unsigned DstSize = Query.Types[0].getSizeInBits();
577 
578     // Handle legal vectors using legalFor
579     if (Query.Types[0].isVector())
580       return false;
581 
582     if (DstSize < 8 || DstSize >= 128 || !isPowerOf2_32(DstSize))
583       return false; // Extending to a scalar s128 needs narrowing.
584 
585     const LLT &SrcTy = Query.Types[1];
586 
587     // Make sure we fit in a register otherwise. Don't bother checking that
588     // the source type is below 128 bits. We shouldn't be allowing anything
589     // through which is wider than the destination in the first place.
590     unsigned SrcSize = SrcTy.getSizeInBits();
591     if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
592       return false;
593 
594     return true;
595   };
596   getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
597       .legalIf(ExtLegalFunc)
598       .legalFor({{v2s64, v2s32}, {v4s32, v4s16}, {v8s16, v8s8}})
599       .clampScalar(0, s64, s64) // Just for s128, others are handled above.
600       .moreElementsToNextPow2(1)
601       .clampMaxNumElements(1, s8, 8)
602       .clampMaxNumElements(1, s16, 4)
603       .clampMaxNumElements(1, s32, 2)
604       // Tries to convert a large EXTEND into two smaller EXTENDs
605       .lowerIf([=](const LegalityQuery &Query) {
606         return (Query.Types[0].getScalarSizeInBits() >
607                 Query.Types[1].getScalarSizeInBits() * 2) &&
608                Query.Types[0].isVector() &&
609                (Query.Types[1].getScalarSizeInBits() == 8 ||
610                 Query.Types[1].getScalarSizeInBits() == 16);
611       });
612 
613   getActionDefinitionsBuilder(G_TRUNC)
614       .legalFor({{v2s32, v2s64}, {v4s16, v4s32}, {v8s8, v8s16}})
615       .moreElementsToNextPow2(0)
616       .clampMaxNumElements(0, s8, 8)
617       .clampMaxNumElements(0, s16, 4)
618       .clampMaxNumElements(0, s32, 2)
619       .minScalarOrEltIf(
620           [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
621           0, s8)
622       .lowerIf([=](const LegalityQuery &Query) {
623         LLT DstTy = Query.Types[0];
624         LLT SrcTy = Query.Types[1];
625         return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
626                DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
627       })
628 
629       .alwaysLegal();
630 
631   getActionDefinitionsBuilder(G_SEXT_INREG)
632       .legalFor({s32, s64})
633       .legalFor(PackedVectorAllTypeList)
634       .maxScalar(0, s64)
635       .clampNumElements(0, v8s8, v16s8)
636       .clampNumElements(0, v4s16, v8s16)
637       .clampNumElements(0, v2s32, v4s32)
638       .clampMaxNumElements(0, s64, 2)
639       .lower();
640 
641   // FP conversions
642   getActionDefinitionsBuilder(G_FPTRUNC)
643       .legalFor(
644           {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
645       .clampNumElements(0, v4s16, v4s16)
646       .clampNumElements(0, v2s32, v2s32)
647       .scalarize(0);
648 
649   getActionDefinitionsBuilder(G_FPEXT)
650       .legalFor(
651           {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
652       .clampNumElements(0, v4s32, v4s32)
653       .clampNumElements(0, v2s64, v2s64)
654       .scalarize(0);
655 
656   // Conversions
657   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
658       .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
659       .legalIf([=](const LegalityQuery &Query) {
660         return HasFP16 &&
661                (Query.Types[1] == s16 || Query.Types[1] == v4s16 ||
662                 Query.Types[1] == v8s16) &&
663                (Query.Types[0] == s32 || Query.Types[0] == s64 ||
664                 Query.Types[0] == v4s16 || Query.Types[0] == v8s16);
665       })
666       .widenScalarToNextPow2(0)
667       .clampScalar(0, s32, s64)
668       .widenScalarToNextPow2(1)
669       .clampScalarOrElt(1, MinFPScalar, s64)
670       .moreElementsToNextPow2(0)
671       .widenScalarIf(
672           [=](const LegalityQuery &Query) {
673             return Query.Types[0].getScalarSizeInBits() >
674                    Query.Types[1].getScalarSizeInBits();
675           },
676           LegalizeMutations::changeElementSizeTo(1, 0))
677       .widenScalarIf(
678           [=](const LegalityQuery &Query) {
679             return Query.Types[0].getScalarSizeInBits() <
680                    Query.Types[1].getScalarSizeInBits();
681           },
682           LegalizeMutations::changeElementSizeTo(0, 1))
683       .clampNumElements(0, v4s16, v8s16)
684       .clampNumElements(0, v2s32, v4s32)
685       .clampMaxNumElements(0, s64, 2);
686 
687   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
688       .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
689       .legalIf([=](const LegalityQuery &Query) {
690         return HasFP16 &&
691                (Query.Types[0] == s16 || Query.Types[0] == v4s16 ||
692                 Query.Types[0] == v8s16) &&
693                (Query.Types[1] == s32 || Query.Types[1] == s64 ||
694                 Query.Types[1] == v4s16 || Query.Types[1] == v8s16);
695       })
696       .widenScalarToNextPow2(1)
697       .clampScalar(1, s32, s64)
698       .widenScalarToNextPow2(0)
699       .clampScalarOrElt(0, MinFPScalar, s64)
700       .moreElementsToNextPow2(0)
701       .widenScalarIf(
702           [=](const LegalityQuery &Query) {
703             return Query.Types[0].getScalarSizeInBits() <
704                    Query.Types[1].getScalarSizeInBits();
705           },
706           LegalizeMutations::changeElementSizeTo(0, 1))
707       .widenScalarIf(
708           [=](const LegalityQuery &Query) {
709             return Query.Types[0].getScalarSizeInBits() >
710                    Query.Types[1].getScalarSizeInBits();
711           },
712           LegalizeMutations::changeElementSizeTo(1, 0))
713       .clampNumElements(0, v4s16, v8s16)
714       .clampNumElements(0, v2s32, v4s32)
715       .clampMaxNumElements(0, s64, 2);
716 
717   // Control-flow
718   getActionDefinitionsBuilder(G_BRCOND)
719     .legalFor({s32})
720     .clampScalar(0, s32, s32);
721   getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
722 
723   getActionDefinitionsBuilder(G_SELECT)
724       .legalFor({{s32, s32}, {s64, s32}, {p0, s32}})
725       .widenScalarToNextPow2(0)
726       .clampScalar(0, s32, s64)
727       .clampScalar(1, s32, s32)
728       .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
729       .lowerIf(isVector(0));
730 
731   // Pointer-handling
732   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
733 
734   if (TM.getCodeModel() == CodeModel::Small)
735     getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
736   else
737     getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
738 
739   getActionDefinitionsBuilder(G_PTRTOINT)
740       .legalFor({{s64, p0}, {v2s64, v2p0}})
741       .widenScalarToNextPow2(0, 64)
742       .clampScalar(0, s64, s64);
743 
744   getActionDefinitionsBuilder(G_INTTOPTR)
745       .unsupportedIf([&](const LegalityQuery &Query) {
746         return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
747       })
748       .legalFor({{p0, s64}, {v2p0, v2s64}});
749 
750   // Casts for 32 and 64-bit width type are just copies.
751   // Same for 128-bit width type, except they are on the FPR bank.
752   getActionDefinitionsBuilder(G_BITCAST)
753       // FIXME: This is wrong since G_BITCAST is not allowed to change the
754       // number of bits but it's what the previous code described and fixing
755       // it breaks tests.
756       .legalForCartesianProduct({s8, s16, s32, s64, s128, v16s8, v8s8, v4s8,
757                                  v8s16, v4s16, v2s16, v4s32, v2s32, v2s64,
758                                  v2p0});
759 
760   getActionDefinitionsBuilder(G_VASTART).legalFor({p0});
761 
762   // va_list must be a pointer, but most sized types are pretty easy to handle
763   // as the destination.
764   getActionDefinitionsBuilder(G_VAARG)
765       .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
766       .clampScalar(0, s8, s64)
767       .widenScalarToNextPow2(0, /*Min*/ 8);
768 
769   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
770       .lowerIf(
771           all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
772 
773   LegalityPredicate UseOutlineAtomics = [&ST](const LegalityQuery &Query) {
774     return ST.outlineAtomics() && !ST.hasLSE();
775   };
776 
777   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
778       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
779                    predNot(UseOutlineAtomics)))
780       .customIf(all(typeIs(0, s128), predNot(UseOutlineAtomics)))
781       .customIf([UseOutlineAtomics](const LegalityQuery &Query) {
782         return Query.Types[0].getSizeInBits() == 128 &&
783                !UseOutlineAtomics(Query);
784       })
785       .libcallIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, p0),
786                      UseOutlineAtomics))
787       .clampScalar(0, s32, s64);
788 
789   getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
790                                G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR,
791                                G_ATOMICRMW_XOR})
792       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
793                    predNot(UseOutlineAtomics)))
794       .libcallIf(all(typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0),
795                      UseOutlineAtomics))
796       .clampScalar(0, s32, s64);
797 
798   // Do not outline these atomics operations, as per comment in
799   // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR().
800   getActionDefinitionsBuilder(
801       {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
802       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)))
803       .clampScalar(0, s32, s64);
804 
805   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});
806 
807   // Merge/Unmerge
808   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
809     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
810     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
811     getActionDefinitionsBuilder(Op)
812         .widenScalarToNextPow2(LitTyIdx, 8)
813         .widenScalarToNextPow2(BigTyIdx, 32)
814         .clampScalar(LitTyIdx, s8, s64)
815         .clampScalar(BigTyIdx, s32, s128)
816         .legalIf([=](const LegalityQuery &Q) {
817           switch (Q.Types[BigTyIdx].getSizeInBits()) {
818           case 32:
819           case 64:
820           case 128:
821             break;
822           default:
823             return false;
824           }
825           switch (Q.Types[LitTyIdx].getSizeInBits()) {
826           case 8:
827           case 16:
828           case 32:
829           case 64:
830             return true;
831           default:
832             return false;
833           }
834         });
835   }
836 
837   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
838       .unsupportedIf([=](const LegalityQuery &Query) {
839         const LLT &EltTy = Query.Types[1].getElementType();
840         return Query.Types[0] != EltTy;
841       })
842       .minScalar(2, s64)
843       .customIf([=](const LegalityQuery &Query) {
844         const LLT &VecTy = Query.Types[1];
845         return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
846                VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
847                VecTy == v8s8 || VecTy == v16s8 || VecTy == v2p0;
848       })
849       .minScalarOrEltIf(
850           [=](const LegalityQuery &Query) {
851             // We want to promote <M x s1> to <M x s64> if that wouldn't
852             // cause the total vec size to be > 128b.
853             return Query.Types[1].getNumElements() <= 2;
854           },
855           0, s64)
856       .minScalarOrEltIf(
857           [=](const LegalityQuery &Query) {
858             return Query.Types[1].getNumElements() <= 4;
859           },
860           0, s32)
861       .minScalarOrEltIf(
862           [=](const LegalityQuery &Query) {
863             return Query.Types[1].getNumElements() <= 8;
864           },
865           0, s16)
866       .minScalarOrEltIf(
867           [=](const LegalityQuery &Query) {
868             return Query.Types[1].getNumElements() <= 16;
869           },
870           0, s8)
871       .minScalarOrElt(0, s8) // Worst case, we need at least s8.
872       .moreElementsToNextPow2(1)
873       .clampMaxNumElements(1, s64, 2)
874       .clampMaxNumElements(1, s32, 4)
875       .clampMaxNumElements(1, s16, 8)
876       .clampMaxNumElements(1, p0, 2);
877 
878   getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
879       .legalIf(typeInSet(0, {v16s8, v8s8, v8s16, v4s16, v4s32, v2s32, v2s64}))
880       .moreElementsToNextPow2(0)
881       .widenVectorEltsToVectorMinSize(0, 64);
882 
883   getActionDefinitionsBuilder(G_BUILD_VECTOR)
884       .legalFor({{v8s8, s8},
885                  {v16s8, s8},
886                  {v4s16, s16},
887                  {v8s16, s16},
888                  {v2s32, s32},
889                  {v4s32, s32},
890                  {v2p0, p0},
891                  {v2s64, s64}})
892       .clampNumElements(0, v4s32, v4s32)
893       .clampNumElements(0, v2s64, v2s64)
894       .minScalarOrElt(0, s8)
895       .widenVectorEltsToVectorMinSize(0, 64)
896       .minScalarSameAs(1, 0);
897 
898   getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
899 
900   getActionDefinitionsBuilder(G_CTLZ)
901       .legalForCartesianProduct(
902           {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
903       .scalarize(1)
904       .widenScalarToNextPow2(1, /*Min=*/32)
905       .clampScalar(1, s32, s64)
906       .scalarSameSizeAs(0, 1);
907   getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();
908 
909   // TODO: Custom lowering for v2s32, v4s32, v2s64.
910   getActionDefinitionsBuilder(G_BITREVERSE)
911       .legalFor({s32, s64, v8s8, v16s8})
912       .widenScalarToNextPow2(0, /*Min = */ 32)
913       .clampScalar(0, s32, s64);
914 
915   getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();
916 
917   getActionDefinitionsBuilder(G_CTTZ)
918       .lowerIf(isVector(0))
919       .widenScalarToNextPow2(1, /*Min=*/32)
920       .clampScalar(1, s32, s64)
921       .scalarSameSizeAs(0, 1)
922       .legalIf([=](const LegalityQuery &Query) {
923         return (HasCSSC && typeInSet(0, {s32, s64})(Query));
924       })
925       .customIf([=](const LegalityQuery &Query) {
926         return (!HasCSSC && typeInSet(0, {s32, s64})(Query));
927       });
928 
929   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
930       .legalIf([=](const LegalityQuery &Query) {
931         const LLT &DstTy = Query.Types[0];
932         const LLT &SrcTy = Query.Types[1];
933         // For now just support the TBL2 variant which needs the source vectors
934         // to be the same size as the dest.
935         if (DstTy != SrcTy)
936           return false;
937         return llvm::is_contained(
938             {v2s64, v2p0, v2s32, v4s32, v4s16, v16s8, v8s8, v8s16}, DstTy);
939       })
940       // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we
941       // just want those lowered into G_BUILD_VECTOR
942       .lowerIf([=](const LegalityQuery &Query) {
943         return !Query.Types[1].isVector();
944       })
945       .moreElementsIf(
946           [](const LegalityQuery &Query) {
947             return Query.Types[0].isVector() && Query.Types[1].isVector() &&
948                    Query.Types[0].getNumElements() >
949                        Query.Types[1].getNumElements();
950           },
951           changeTo(1, 0))
952       .moreElementsToNextPow2(0)
953       .clampNumElements(0, v4s32, v4s32)
954       .clampNumElements(0, v2s64, v2s64)
955       .moreElementsIf(
956           [](const LegalityQuery &Query) {
957             return Query.Types[0].isVector() && Query.Types[1].isVector() &&
958                    Query.Types[0].getNumElements() <
959                        Query.Types[1].getNumElements();
960           },
961           changeTo(0, 1));
962 
963   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
964       .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}});
965 
966   getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({p0});
967 
968   getActionDefinitionsBuilder(G_BRJT).legalFor({{p0, s64}});
969 
970   getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom();
971 
972   getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower();
973 
974   if (ST.hasMOPS()) {
975     // G_BZERO is not supported. Currently it is only emitted by
976     // PreLegalizerCombiner for G_MEMSET with zero constant.
977     getActionDefinitionsBuilder(G_BZERO).unsupported();
978 
979     getActionDefinitionsBuilder(G_MEMSET)
980         .legalForCartesianProduct({p0}, {s64}, {s64})
981         .customForCartesianProduct({p0}, {s8}, {s64})
982         .immIdx(0); // Inform verifier imm idx 0 is handled.
983 
984     getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
985         .legalForCartesianProduct({p0}, {p0}, {s64})
986         .immIdx(0); // Inform verifier imm idx 0 is handled.
987 
988     // G_MEMCPY_INLINE does not have a tailcall immediate
989     getActionDefinitionsBuilder(G_MEMCPY_INLINE)
990         .legalForCartesianProduct({p0}, {p0}, {s64});
991 
992   } else {
993     getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
994         .libcall();
995   }
996 
997   // FIXME: Legal vector types are only legal with NEON.
998   auto &ABSActions = getActionDefinitionsBuilder(G_ABS);
999   if (HasCSSC)
1000     ABSActions
1001         .legalFor({s32, s64});
1002   ABSActions
1003       .legalFor(PackedVectorAllTypeList)
1004       .lowerIf(isScalar(0));
1005 
1006   // For fadd reductions we have pairwise operations available. We treat the
1007   // usual legal types as legal and handle the lowering to pairwise instructions
1008   // later.
1009   getActionDefinitionsBuilder(G_VECREDUCE_FADD)
1010       .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
1011       .legalIf([=](const LegalityQuery &Query) {
1012         const auto &Ty = Query.Types[1];
1013         return (Ty == v4s16 || Ty == v8s16) && HasFP16;
1014       })
1015       .minScalarOrElt(0, MinFPScalar)
1016       .clampMaxNumElements(1, s64, 2)
1017       .clampMaxNumElements(1, s32, 4)
1018       .clampMaxNumElements(1, s16, 8)
1019       .lower();
1020 
1021   // For fmul reductions we need to split up into individual operations. We
1022   // clamp to 128 bit vectors then to 64bit vectors to produce a cascade of
1023   // smaller types, followed by scalarizing what remains.
1024   getActionDefinitionsBuilder(G_VECREDUCE_FMUL)
1025       .minScalarOrElt(0, MinFPScalar)
1026       .clampMaxNumElements(1, s64, 2)
1027       .clampMaxNumElements(1, s32, 4)
1028       .clampMaxNumElements(1, s16, 8)
1029       .clampMaxNumElements(1, s32, 2)
1030       .clampMaxNumElements(1, s16, 4)
1031       .scalarize(1)
1032       .lower();
1033 
1034   getActionDefinitionsBuilder({G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL})
1035       .scalarize(2)
1036       .lower();
1037 
1038   getActionDefinitionsBuilder(G_VECREDUCE_ADD)
1039       .legalFor({{s8, v16s8},
1040                  {s8, v8s8},
1041                  {s16, v8s16},
1042                  {s16, v4s16},
1043                  {s32, v4s32},
1044                  {s32, v2s32},
1045                  {s64, v2s64}})
1046       .clampMaxNumElements(1, s64, 2)
1047       .clampMaxNumElements(1, s32, 4)
1048       .clampMaxNumElements(1, s16, 8)
1049       .clampMaxNumElements(1, s8, 16)
1050       .lower();
1051 
1052   getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX,
1053                                G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM})
1054       .legalFor({{s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
1055       .legalIf([=](const LegalityQuery &Query) {
1056         const auto &Ty = Query.Types[1];
1057         return Query.Types[0] == s16 && (Ty == v8s16 || Ty == v4s16) && HasFP16;
1058       })
1059       .minScalarOrElt(0, MinFPScalar)
1060       .clampMaxNumElements(1, s64, 2)
1061       .clampMaxNumElements(1, s32, 4)
1062       .clampMaxNumElements(1, s16, 8)
1063       .lower();
1064 
1065   getActionDefinitionsBuilder(G_VECREDUCE_MUL)
1066       .clampMaxNumElements(1, s32, 2)
1067       .clampMaxNumElements(1, s16, 4)
1068       .clampMaxNumElements(1, s8, 8)
1069       .scalarize(1)
1070       .lower();
1071 
1072   getActionDefinitionsBuilder(
1073       {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX})
1074       .legalFor({{s8, v8s8},
1075                  {s8, v16s8},
1076                  {s16, v4s16},
1077                  {s16, v8s16},
1078                  {s32, v2s32},
1079                  {s32, v4s32}})
1080       .clampMaxNumElements(1, s64, 2)
1081       .clampMaxNumElements(1, s32, 4)
1082       .clampMaxNumElements(1, s16, 8)
1083       .clampMaxNumElements(1, s8, 16)
1084       .scalarize(1)
1085       .lower();
1086 
1087   getActionDefinitionsBuilder(
1088       {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
1089       // Try to break down into smaller vectors as long as they're at least 64
1090       // bits. This lets us use vector operations for some parts of the
1091       // reduction.
1092       .fewerElementsIf(
1093           [=](const LegalityQuery &Q) {
1094             LLT SrcTy = Q.Types[1];
1095             if (SrcTy.isScalar())
1096               return false;
1097             if (!isPowerOf2_32(SrcTy.getNumElements()))
1098               return false;
1099             // We can usually perform 64b vector operations.
1100             return SrcTy.getSizeInBits() > 64;
1101           },
1102           [=](const LegalityQuery &Q) {
1103             LLT SrcTy = Q.Types[1];
1104             return std::make_pair(1, SrcTy.divide(2));
1105           })
1106       .scalarize(1)
1107       .lower();
1108 
1109   getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
1110       .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });
1111 
1112   getActionDefinitionsBuilder({G_FSHL, G_FSHR})
1113       .customFor({{s32, s32}, {s32, s64}, {s64, s64}})
1114       .lower();
1115 
1116   getActionDefinitionsBuilder(G_ROTR)
1117       .legalFor({{s32, s64}, {s64, s64}})
1118       .customIf([=](const LegalityQuery &Q) {
1119         return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
1120       })
1121       .lower();
1122   getActionDefinitionsBuilder(G_ROTL).lower();
1123 
1124   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1125       .customFor({{s32, s32}, {s64, s64}});
1126 
1127   auto always = [=](const LegalityQuery &Q) { return true; };
1128   auto &CTPOPActions = getActionDefinitionsBuilder(G_CTPOP);
1129   if (HasCSSC)
1130     CTPOPActions
1131         .legalFor({{s32, s32},
1132                    {s64, s64},
1133                    {v8s8, v8s8},
1134                    {v16s8, v16s8}})
1135         .customFor({{s128, s128},
1136                     {v2s64, v2s64},
1137                     {v2s32, v2s32},
1138                     {v4s32, v4s32},
1139                     {v4s16, v4s16},
1140                     {v8s16, v8s16}});
1141   else
1142     CTPOPActions
1143         .legalFor({{v8s8, v8s8},
1144                    {v16s8, v16s8}})
1145         .customFor({{s32, s32},
1146                     {s64, s64},
1147                     {s128, s128},
1148                     {v2s64, v2s64},
1149                     {v2s32, v2s32},
1150                     {v4s32, v4s32},
1151                     {v4s16, v4s16},
1152                     {v8s16, v8s16}});
1153   CTPOPActions
1154       .clampScalar(0, s32, s128)
1155       .widenScalarToNextPow2(0)
1156       .minScalarEltSameAsIf(always, 1, 0)
1157       .maxScalarEltSameAsIf(always, 1, 0);
1158 
1159   // TODO: Vector types.
1160   getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0));
1161 
1162   // TODO: Libcall support for s128.
1163   // TODO: s16 should be legal with full FP16 support.
1164   getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1165       .legalFor({{s64, s32}, {s64, s64}});
1166 
1167   // TODO: Custom legalization for vector types.
1168   // TODO: Custom legalization for mismatched types.
1169   // TODO: s16 support.
1170   getActionDefinitionsBuilder(G_FCOPYSIGN).customFor({{s32, s32}, {s64, s64}});
1171 
1172   getActionDefinitionsBuilder(G_FMAD).lower();
1173 
1174   // Access to floating-point environment.
1175   getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV,
1176                                G_GET_FPMODE, G_SET_FPMODE, G_RESET_FPMODE})
1177       .libcall();
1178 
1179   getActionDefinitionsBuilder(G_IS_FPCLASS).lower();
1180 
1181   getActionDefinitionsBuilder(G_PREFETCH).custom();
1182 
1183   getLegacyLegalizerInfo().computeTables();
1184   verify(*ST.getInstrInfo());
1185 }
1186 
1187 bool AArch64LegalizerInfo::legalizeCustom(
1188     LegalizerHelper &Helper, MachineInstr &MI,
1189     LostDebugLocObserver &LocObserver) const {
1190   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1191   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1192   GISelChangeObserver &Observer = Helper.Observer;
1193   switch (MI.getOpcode()) {
1194   default:
1195     // No idea what to do.
1196     return false;
1197   case TargetOpcode::G_VAARG:
1198     return legalizeVaArg(MI, MRI, MIRBuilder);
1199   case TargetOpcode::G_LOAD:
1200   case TargetOpcode::G_STORE:
1201     return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
1202   case TargetOpcode::G_SHL:
1203   case TargetOpcode::G_ASHR:
1204   case TargetOpcode::G_LSHR:
1205     return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
1206   case TargetOpcode::G_GLOBAL_VALUE:
1207     return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
1208   case TargetOpcode::G_SBFX:
1209   case TargetOpcode::G_UBFX:
1210     return legalizeBitfieldExtract(MI, MRI, Helper);
1211   case TargetOpcode::G_FSHL:
1212   case TargetOpcode::G_FSHR:
1213     return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper);
1214   case TargetOpcode::G_ROTR:
1215     return legalizeRotate(MI, MRI, Helper);
1216   case TargetOpcode::G_CTPOP:
1217     return legalizeCTPOP(MI, MRI, Helper);
1218   case TargetOpcode::G_ATOMIC_CMPXCHG:
1219     return legalizeAtomicCmpxchg128(MI, MRI, Helper);
1220   case TargetOpcode::G_CTTZ:
1221     return legalizeCTTZ(MI, Helper);
1222   case TargetOpcode::G_BZERO:
1223   case TargetOpcode::G_MEMCPY:
1224   case TargetOpcode::G_MEMMOVE:
1225   case TargetOpcode::G_MEMSET:
1226     return legalizeMemOps(MI, Helper);
1227   case TargetOpcode::G_FCOPYSIGN:
1228     return legalizeFCopySign(MI, Helper);
1229   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1230     return legalizeExtractVectorElt(MI, MRI, Helper);
1231   case TargetOpcode::G_DYN_STACKALLOC:
1232     return legalizeDynStackAlloc(MI, Helper);
1233   case TargetOpcode::G_PREFETCH:
1234     return legalizePrefetch(MI, Helper);
1235   }
1236 
1237   llvm_unreachable("expected switch to return");
1238 }
1239 
1240 bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
1241                                                MachineRegisterInfo &MRI,
1242                                                MachineIRBuilder &MIRBuilder,
1243                                                GISelChangeObserver &Observer,
1244                                                LegalizerHelper &Helper) const {
1245   assert(MI.getOpcode() == TargetOpcode::G_FSHL ||
1246          MI.getOpcode() == TargetOpcode::G_FSHR);
1247 
1248   // Keep as G_FSHR if shift amount is a G_CONSTANT, else use generic
1249   // lowering
1250   Register ShiftNo = MI.getOperand(3).getReg();
1251   LLT ShiftTy = MRI.getType(ShiftNo);
1252   auto VRegAndVal = getIConstantVRegValWithLookThrough(ShiftNo, MRI);
1253 
1254   // Adjust shift amount according to Opcode (FSHL/FSHR)
1255   // Convert FSHL to FSHR
1256   LLT OperationTy = MRI.getType(MI.getOperand(0).getReg());
1257   APInt BitWidth(ShiftTy.getSizeInBits(), OperationTy.getSizeInBits(), false);
1258 
1259   // Lower non-constant shifts and leave zero shifts to the optimizer.
1260   if (!VRegAndVal || VRegAndVal->Value.urem(BitWidth) == 0)
1261     return (Helper.lowerFunnelShiftAsShifts(MI) ==
1262             LegalizerHelper::LegalizeResult::Legalized);
1263 
1264   APInt Amount = VRegAndVal->Value.urem(BitWidth);
1265 
1266   Amount = MI.getOpcode() == TargetOpcode::G_FSHL ? BitWidth - Amount : Amount;
1267 
1268   // If the instruction is G_FSHR, has a 64-bit G_CONSTANT for shift amount
1269   // in the range of 0 <-> BitWidth, it is legal
1270   if (ShiftTy.getSizeInBits() == 64 && MI.getOpcode() == TargetOpcode::G_FSHR &&
1271       VRegAndVal->Value.ult(BitWidth))
1272     return true;
1273 
1274   // Cast the ShiftNumber to a 64-bit type
1275   auto Cast64 = MIRBuilder.buildConstant(LLT::scalar(64), Amount.zext(64));
1276 
1277   if (MI.getOpcode() == TargetOpcode::G_FSHR) {
1278     Observer.changingInstr(MI);
1279     MI.getOperand(3).setReg(Cast64.getReg(0));
1280     Observer.changedInstr(MI);
1281   }
1282   // If Opcode is FSHL, remove the FSHL instruction and create a FSHR
1283   // instruction
1284   else if (MI.getOpcode() == TargetOpcode::G_FSHL) {
1285     MIRBuilder.buildInstr(TargetOpcode::G_FSHR, {MI.getOperand(0).getReg()},
1286                           {MI.getOperand(1).getReg(), MI.getOperand(2).getReg(),
1287                            Cast64.getReg(0)});
1288     MI.eraseFromParent();
1289   }
1290   return true;
1291 }
1292 
1293 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
1294                                           MachineRegisterInfo &MRI,
1295                                           LegalizerHelper &Helper) const {
1296   // To allow for imported patterns to match, we ensure that the rotate amount
1297   // is 64b with an extension.
1298   Register AmtReg = MI.getOperand(2).getReg();
1299   LLT AmtTy = MRI.getType(AmtReg);
1300   (void)AmtTy;
1301   assert(AmtTy.isScalar() && "Expected a scalar rotate");
1302   assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
1303   auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
1304   Helper.Observer.changingInstr(MI);
1305   MI.getOperand(2).setReg(NewAmt.getReg(0));
1306   Helper.Observer.changedInstr(MI);
1307   return true;
1308 }
1309 
1310 bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
1311     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1312     GISelChangeObserver &Observer) const {
1313   assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
1314   // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
1315   // G_ADD_LOW instructions.
1316   // By splitting this here, we can optimize accesses in the small code model by
1317   // folding in the G_ADD_LOW into the load/store offset.
1318   auto &GlobalOp = MI.getOperand(1);
1319   const auto* GV = GlobalOp.getGlobal();
1320   if (GV->isThreadLocal())
1321     return true; // Don't want to modify TLS vars.
1322 
1323   auto &TM = ST->getTargetLowering()->getTargetMachine();
1324   unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);
1325 
1326   if (OpFlags & AArch64II::MO_GOT)
1327     return true;
1328 
1329   auto Offset = GlobalOp.getOffset();
1330   Register DstReg = MI.getOperand(0).getReg();
1331   auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
1332                   .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
1333   // Set the regclass on the dest reg too.
1334   MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1335 
1336   // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
1337   // by creating a MOVK that sets bits 48-63 of the register to (global address
1338   // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
1339   // prevent an incorrect tag being generated during relocation when the
1340   // global appears before the code section. Without the offset, a global at
1341   // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
1342   // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
1343   // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
1344   // instead of `0xf`.
1345   // This assumes that we're in the small code model so we can assume a binary
1346   // size of <= 4GB, which makes the untagged PC relative offset positive. The
1347   // binary must also be loaded into address range [0, 2^48). Both of these
1348   // properties need to be ensured at runtime when using tagged addresses.
1349   if (OpFlags & AArch64II::MO_TAGGED) {
1350     assert(!Offset &&
1351            "Should not have folded in an offset for a tagged global!");
1352     ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
1353                .addGlobalAddress(GV, 0x100000000,
1354                                  AArch64II::MO_PREL | AArch64II::MO_G3)
1355                .addImm(48);
1356     MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1357   }
1358 
1359   MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
1360       .addGlobalAddress(GV, Offset,
1361                         OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
1362   MI.eraseFromParent();
1363   return true;
1364 }
1365 
1366 bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
1367                                              MachineInstr &MI) const {
1368   Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
1369   switch (IntrinsicID) {
1370   case Intrinsic::vacopy: {
1371     unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
1372     unsigned VaListSize =
1373       (ST->isTargetDarwin() || ST->isTargetWindows())
1374           ? PtrSize
1375           : ST->isTargetILP32() ? 20 : 32;
1376 
1377     MachineFunction &MF = *MI.getMF();
1378     auto Val = MF.getRegInfo().createGenericVirtualRegister(
1379         LLT::scalar(VaListSize * 8));
1380     MachineIRBuilder MIB(MI);
1381     MIB.buildLoad(Val, MI.getOperand(2),
1382                   *MF.getMachineMemOperand(MachinePointerInfo(),
1383                                            MachineMemOperand::MOLoad,
1384                                            VaListSize, Align(PtrSize)));
1385     MIB.buildStore(Val, MI.getOperand(1),
1386                    *MF.getMachineMemOperand(MachinePointerInfo(),
1387                                             MachineMemOperand::MOStore,
1388                                             VaListSize, Align(PtrSize)));
1389     MI.eraseFromParent();
1390     return true;
1391   }
1392   case Intrinsic::get_dynamic_area_offset: {
1393     MachineIRBuilder &MIB = Helper.MIRBuilder;
1394     MIB.buildConstant(MI.getOperand(0).getReg(), 0);
1395     MI.eraseFromParent();
1396     return true;
1397   }
1398   case Intrinsic::aarch64_mops_memset_tag: {
1399     assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1400     // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
1401     // the instruction).
1402     MachineIRBuilder MIB(MI);
1403     auto &Value = MI.getOperand(3);
1404     Register ExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
1405     Value.setReg(ExtValueReg);
1406     return true;
1407   }
1408   case Intrinsic::aarch64_prefetch: {
1409     MachineIRBuilder MIB(MI);
1410     auto &AddrVal = MI.getOperand(1);
1411 
1412     int64_t IsWrite = MI.getOperand(2).getImm();
1413     int64_t Target = MI.getOperand(3).getImm();
1414     int64_t IsStream = MI.getOperand(4).getImm();
1415     int64_t IsData = MI.getOperand(5).getImm();
1416 
1417     unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
1418                      (!IsData << 3) |    // IsDataCache bit
1419                      (Target << 1) |     // Cache level bits
1420                      (unsigned)IsStream; // Stream bit
1421 
1422     MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
1423     MI.eraseFromParent();
1424     return true;
1425   }
1426   case Intrinsic::aarch64_neon_uaddv:
1427   case Intrinsic::aarch64_neon_saddv:
1428   case Intrinsic::aarch64_neon_umaxv:
1429   case Intrinsic::aarch64_neon_smaxv:
1430   case Intrinsic::aarch64_neon_uminv:
1431   case Intrinsic::aarch64_neon_sminv: {
1432     MachineIRBuilder MIB(MI);
1433     MachineRegisterInfo &MRI = *MIB.getMRI();
1434     bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv ||
1435                     IntrinsicID == Intrinsic::aarch64_neon_smaxv ||
1436                     IntrinsicID == Intrinsic::aarch64_neon_sminv;
1437 
1438     auto OldDst = MI.getOperand(0).getReg();
1439     auto OldDstTy = MRI.getType(OldDst);
1440     LLT NewDstTy = MRI.getType(MI.getOperand(2).getReg()).getElementType();
1441     if (OldDstTy == NewDstTy)
1442       return true;
1443 
1444     auto NewDst = MRI.createGenericVirtualRegister(NewDstTy);
1445 
1446     Helper.Observer.changingInstr(MI);
1447     MI.getOperand(0).setReg(NewDst);
1448     Helper.Observer.changedInstr(MI);
1449 
1450     MIB.setInsertPt(MIB.getMBB(), ++MIB.getInsertPt());
1451     MIB.buildExtOrTrunc(IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT,
1452                         OldDst, NewDst);
1453 
1454     return true;
1455   }
1456   case Intrinsic::aarch64_neon_uaddlp:
1457   case Intrinsic::aarch64_neon_saddlp: {
1458     MachineIRBuilder MIB(MI);
1459 
1460     unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
1461                        ? AArch64::G_UADDLP
1462                        : AArch64::G_SADDLP;
1463     MIB.buildInstr(Opc, {MI.getOperand(0)}, {MI.getOperand(2)});
1464     MI.eraseFromParent();
1465 
1466     return true;
1467   }
1468   case Intrinsic::aarch64_neon_uaddlv:
1469   case Intrinsic::aarch64_neon_saddlv: {
1470     MachineIRBuilder MIB(MI);
1471     MachineRegisterInfo &MRI = *MIB.getMRI();
1472 
1473     unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
1474                        ? AArch64::G_UADDLV
1475                        : AArch64::G_SADDLV;
1476     Register DstReg = MI.getOperand(0).getReg();
1477     Register SrcReg = MI.getOperand(2).getReg();
1478     LLT DstTy = MRI.getType(DstReg);
1479 
1480     LLT MidTy, ExtTy;
1481     if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) {
1482       MidTy = LLT::fixed_vector(4, 32);
1483       ExtTy = LLT::scalar(32);
1484     } else {
1485       MidTy = LLT::fixed_vector(2, 64);
1486       ExtTy = LLT::scalar(64);
1487     }
1488 
1489     Register MidReg =
1490         MIB.buildInstr(Opc, {MidTy}, {SrcReg})->getOperand(0).getReg();
1491     Register ZeroReg =
1492         MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
1493     Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy},
1494                                      {MidReg, ZeroReg})
1495                           .getReg(0);
1496 
1497     if (DstTy.getScalarSizeInBits() < 32)
1498       MIB.buildTrunc(DstReg, ExtReg);
1499     else
1500       MIB.buildCopy(DstReg, ExtReg);
1501 
1502     MI.eraseFromParent();
1503 
1504     return true;
1505   }
1506   case Intrinsic::aarch64_neon_smax:
1507   case Intrinsic::aarch64_neon_smin:
1508   case Intrinsic::aarch64_neon_umax:
1509   case Intrinsic::aarch64_neon_umin:
1510   case Intrinsic::aarch64_neon_fmax:
1511   case Intrinsic::aarch64_neon_fmin:
1512   case Intrinsic::aarch64_neon_fmaxnm:
1513   case Intrinsic::aarch64_neon_fminnm: {
1514     MachineIRBuilder MIB(MI);
1515     if (IntrinsicID == Intrinsic::aarch64_neon_smax)
1516       MIB.buildSMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
1517     else if (IntrinsicID == Intrinsic::aarch64_neon_smin)
1518       MIB.buildSMin(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
1519     else if (IntrinsicID == Intrinsic::aarch64_neon_umax)
1520       MIB.buildUMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
1521     else if (IntrinsicID == Intrinsic::aarch64_neon_umin)
1522       MIB.buildUMin(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
1523     else if (IntrinsicID == Intrinsic::aarch64_neon_fmax)
1524       MIB.buildInstr(TargetOpcode::G_FMAXIMUM, {MI.getOperand(0)},
1525                      {MI.getOperand(2), MI.getOperand(3)});
1526     else if (IntrinsicID == Intrinsic::aarch64_neon_fmin)
1527       MIB.buildInstr(TargetOpcode::G_FMINIMUM, {MI.getOperand(0)},
1528                      {MI.getOperand(2), MI.getOperand(3)});
1529     else if (IntrinsicID == Intrinsic::aarch64_neon_fmaxnm)
1530       MIB.buildInstr(TargetOpcode::G_FMAXNUM, {MI.getOperand(0)},
1531                      {MI.getOperand(2), MI.getOperand(3)});
1532     else if (IntrinsicID == Intrinsic::aarch64_neon_fminnm)
1533       MIB.buildInstr(TargetOpcode::G_FMINNUM, {MI.getOperand(0)},
1534                      {MI.getOperand(2), MI.getOperand(3)});
1535     MI.eraseFromParent();
1536     return true;
1537   }
1538   case Intrinsic::experimental_vector_reverse:
1539     // TODO: Add support for vector_reverse
1540     return false;
1541   }
1542 
1543   return true;
1544 }
1545 
1546 bool AArch64LegalizerInfo::legalizeShlAshrLshr(
1547     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1548     GISelChangeObserver &Observer) const {
1549   assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
1550          MI.getOpcode() == TargetOpcode::G_LSHR ||
1551          MI.getOpcode() == TargetOpcode::G_SHL);
1552   // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
1553   // imported patterns can select it later. Either way, it will be legal.
1554   Register AmtReg = MI.getOperand(2).getReg();
1555   auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
1556   if (!VRegAndVal)
1557     return true;
1558   // Check the shift amount is in range for an immediate form.
1559   int64_t Amount = VRegAndVal->Value.getSExtValue();
1560   if (Amount > 31)
1561     return true; // This will have to remain a register variant.
1562   auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
1563   Observer.changingInstr(MI);
1564   MI.getOperand(2).setReg(ExtCst.getReg(0));
1565   Observer.changedInstr(MI);
1566   return true;
1567 }
1568 
1569 static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
1570                                 MachineRegisterInfo &MRI) {
1571   Base = Root;
1572   Offset = 0;
1573 
1574   Register NewBase;
1575   int64_t NewOffset;
1576   if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
1577       isShiftedInt<7, 3>(NewOffset)) {
1578     Base = NewBase;
1579     Offset = NewOffset;
1580   }
1581 }
1582 
1583 // FIXME: This should be removed and replaced with the generic bitcast legalize
1584 // action.
1585 bool AArch64LegalizerInfo::legalizeLoadStore(
1586     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1587     GISelChangeObserver &Observer) const {
1588   assert(MI.getOpcode() == TargetOpcode::G_STORE ||
1589          MI.getOpcode() == TargetOpcode::G_LOAD);
1590   // Here we just try to handle vector loads/stores where our value type might
1591   // have pointer elements, which the SelectionDAG importer can't handle. To
1592   // allow the existing patterns for s64 to fire for p0, we just try to bitcast
1593   // the value to use s64 types.
1594 
1595   // Custom legalization requires the instruction, if not deleted, must be fully
1596   // legalized. In order to allow further legalization of the inst, we create
1597   // a new instruction and erase the existing one.
1598 
1599   Register ValReg = MI.getOperand(0).getReg();
1600   const LLT ValTy = MRI.getType(ValReg);
1601 
1602   if (ValTy == LLT::scalar(128)) {
1603 
1604     AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
1605     bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
1606     bool IsLoadAcquire = IsLoad && Ordering == AtomicOrdering::Acquire;
1607     bool IsStoreRelease = !IsLoad && Ordering == AtomicOrdering::Release;
1608     bool IsRcpC3 =
1609         ST->hasLSE2() && ST->hasRCPC3() && (IsLoadAcquire || IsStoreRelease);
1610 
1611     LLT s64 = LLT::scalar(64);
1612 
1613     unsigned Opcode;
1614     if (IsRcpC3) {
1615       Opcode = IsLoad ? AArch64::LDIAPPX : AArch64::STILPX;
1616     } else {
1617       // For LSE2, loads/stores should have been converted to monotonic and had
1618       // a fence inserted after them.
1619       assert(Ordering == AtomicOrdering::Monotonic ||
1620              Ordering == AtomicOrdering::Unordered);
1621       assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
1622 
1623       Opcode = IsLoad ? AArch64::LDPXi : AArch64::STPXi;
1624     }
1625 
1626     MachineInstrBuilder NewI;
1627     if (IsLoad) {
1628       NewI = MIRBuilder.buildInstr(Opcode, {s64, s64}, {});
1629       MIRBuilder.buildMergeLikeInstr(
1630           ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
1631     } else {
1632       auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
1633       NewI = MIRBuilder.buildInstr(
1634           Opcode, {}, {Split->getOperand(0), Split->getOperand(1)});
1635     }
1636 
1637     if (IsRcpC3) {
1638       NewI.addUse(MI.getOperand(1).getReg());
1639     } else {
1640       Register Base;
1641       int Offset;
1642       matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
1643       NewI.addUse(Base);
1644       NewI.addImm(Offset / 8);
1645     }
1646 
1647     NewI.cloneMemRefs(MI);
1648     constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
1649                                      *MRI.getTargetRegisterInfo(),
1650                                      *ST->getRegBankInfo());
1651     MI.eraseFromParent();
1652     return true;
1653   }
1654 
1655   if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
1656       ValTy.getElementType().getAddressSpace() != 0) {
1657     LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
1658     return false;
1659   }
1660 
1661   unsigned PtrSize = ValTy.getElementType().getSizeInBits();
1662   const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
1663   auto &MMO = **MI.memoperands_begin();
1664   MMO.setType(NewTy);
1665 
1666   if (MI.getOpcode() == TargetOpcode::G_STORE) {
1667     auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
1668     MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
1669   } else {
1670     auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
1671     MIRBuilder.buildBitcast(ValReg, NewLoad);
1672   }
1673   MI.eraseFromParent();
1674   return true;
1675 }
1676 
1677 bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
1678                                          MachineRegisterInfo &MRI,
1679                                          MachineIRBuilder &MIRBuilder) const {
1680   MachineFunction &MF = MIRBuilder.getMF();
1681   Align Alignment(MI.getOperand(2).getImm());
1682   Register Dst = MI.getOperand(0).getReg();
1683   Register ListPtr = MI.getOperand(1).getReg();
1684 
1685   LLT PtrTy = MRI.getType(ListPtr);
1686   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1687 
1688   const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
1689   const Align PtrAlign = Align(PtrSize);
1690   auto List = MIRBuilder.buildLoad(
1691       PtrTy, ListPtr,
1692       *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1693                                PtrTy, PtrAlign));
1694 
1695   MachineInstrBuilder DstPtr;
1696   if (Alignment > PtrAlign) {
1697     // Realign the list to the actual required alignment.
1698     auto AlignMinus1 =
1699         MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
1700     auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
1701     DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
1702   } else
1703     DstPtr = List;
1704 
1705   LLT ValTy = MRI.getType(Dst);
1706   uint64_t ValSize = ValTy.getSizeInBits() / 8;
1707   MIRBuilder.buildLoad(
1708       Dst, DstPtr,
1709       *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1710                                ValTy, std::max(Alignment, PtrAlign)));
1711 
1712   auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));
1713 
1714   auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));
1715 
1716   MIRBuilder.buildStore(NewList, ListPtr,
1717                         *MF.getMachineMemOperand(MachinePointerInfo(),
1718                                                  MachineMemOperand::MOStore,
1719                                                  PtrTy, PtrAlign));
1720 
1721   MI.eraseFromParent();
1722   return true;
1723 }
1724 
1725 bool AArch64LegalizerInfo::legalizeBitfieldExtract(
1726     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
1727   // Only legal if we can select immediate forms.
1728   // TODO: Lower this otherwise.
1729   return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
1730          getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
1731 }
1732 
1733 bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
1734                                          MachineRegisterInfo &MRI,
1735                                          LegalizerHelper &Helper) const {
1736   // When there is no integer popcount instruction (FEAT_CSSC isn't available),
1737   // it can be more efficiently lowered to the following sequence that uses
1738   // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
1739   // registers are cheap.
1740   //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
1741   //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
1742   //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
1743   //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
1744   //
1745   // For 128 bit vector popcounts, we lower to the following sequence:
1746   //  cnt.16b   v0, v0  // v8s16, v4s32, v2s64
1747   //  uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
1748   //  uaddlp.4s v0, v0  //        v4s32, v2s64
1749   //  uaddlp.2d v0, v0  //               v2s64
1750   //
1751   // For 64 bit vector popcounts, we lower to the following sequence:
1752   //  cnt.8b    v0, v0  // v4s16, v2s32
1753   //  uaddlp.4h v0, v0  // v4s16, v2s32
1754   //  uaddlp.2s v0, v0  //        v2s32
1755 
1756   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1757   Register Dst = MI.getOperand(0).getReg();
1758   Register Val = MI.getOperand(1).getReg();
1759   LLT Ty = MRI.getType(Val);
1760   unsigned Size = Ty.getSizeInBits();
1761 
1762   assert(Ty == MRI.getType(Dst) &&
1763          "Expected src and dst to have the same type!");
1764 
1765   if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
1766     LLT s64 = LLT::scalar(64);
1767 
1768     auto Split = MIRBuilder.buildUnmerge(s64, Val);
1769     auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0));
1770     auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1));
1771     auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2);
1772 
1773     MIRBuilder.buildZExt(Dst, Add);
1774     MI.eraseFromParent();
1775     return true;
1776   }
1777 
1778   if (!ST->hasNEON() ||
1779       MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) {
1780     // Use generic lowering when custom lowering is not possible.
1781     return Ty.isScalar() && (Size == 32 || Size == 64) &&
1782            Helper.lowerBitCount(MI) ==
1783                LegalizerHelper::LegalizeResult::Legalized;
1784   }
1785 
1786   // Pre-conditioning: widen Val up to the nearest vector type.
1787   // s32,s64,v4s16,v2s32 -> v8i8
1788   // v8s16,v4s32,v2s64 -> v16i8
1789   LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
1790   if (Ty.isScalar()) {
1791     assert((Size == 32 || Size == 64 || Size == 128) && "Expected only 32, 64, or 128 bit scalars!");
1792     if (Size == 32) {
1793       Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
1794     }
1795   }
1796   Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);
1797 
1798   // Count bits in each byte-sized lane.
1799   auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);
1800 
1801   // Sum across lanes.
1802   Register HSum = CTPOP.getReg(0);
1803   unsigned Opc;
1804   SmallVector<LLT> HAddTys;
1805   if (Ty.isScalar()) {
1806     Opc = Intrinsic::aarch64_neon_uaddlv;
1807     HAddTys.push_back(LLT::scalar(32));
1808   } else if (Ty == LLT::fixed_vector(8, 16)) {
1809     Opc = Intrinsic::aarch64_neon_uaddlp;
1810     HAddTys.push_back(LLT::fixed_vector(8, 16));
1811   } else if (Ty == LLT::fixed_vector(4, 32)) {
1812     Opc = Intrinsic::aarch64_neon_uaddlp;
1813     HAddTys.push_back(LLT::fixed_vector(8, 16));
1814     HAddTys.push_back(LLT::fixed_vector(4, 32));
1815   } else if (Ty == LLT::fixed_vector(2, 64)) {
1816     Opc = Intrinsic::aarch64_neon_uaddlp;
1817     HAddTys.push_back(LLT::fixed_vector(8, 16));
1818     HAddTys.push_back(LLT::fixed_vector(4, 32));
1819     HAddTys.push_back(LLT::fixed_vector(2, 64));
1820   } else if (Ty == LLT::fixed_vector(4, 16)) {
1821     Opc = Intrinsic::aarch64_neon_uaddlp;
1822     HAddTys.push_back(LLT::fixed_vector(4, 16));
1823   } else if (Ty == LLT::fixed_vector(2, 32)) {
1824     Opc = Intrinsic::aarch64_neon_uaddlp;
1825     HAddTys.push_back(LLT::fixed_vector(4, 16));
1826     HAddTys.push_back(LLT::fixed_vector(2, 32));
1827   } else
1828     llvm_unreachable("unexpected vector shape");
1829   MachineInstrBuilder UADD;
1830   for (LLT HTy : HAddTys) {
1831     UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}).addUse(HSum);
1832     HSum = UADD.getReg(0);
1833   }
1834 
1835   // Post-conditioning.
1836   if (Ty.isScalar() && (Size == 64 || Size == 128))
1837     MIRBuilder.buildZExt(Dst, UADD);
1838   else
1839     UADD->getOperand(0).setReg(Dst);
1840   MI.eraseFromParent();
1841   return true;
1842 }
1843 
1844 bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
1845     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
1846   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1847   LLT s64 = LLT::scalar(64);
1848   auto Addr = MI.getOperand(1).getReg();
1849   auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
1850   auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
1851   auto DstLo = MRI.createGenericVirtualRegister(s64);
1852   auto DstHi = MRI.createGenericVirtualRegister(s64);
1853 
1854   MachineInstrBuilder CAS;
1855   if (ST->hasLSE()) {
1856     // We have 128-bit CASP instructions taking XSeqPair registers, which are
1857     // s128. We need the merge/unmerge to bracket the expansion and pair up with
1858     // the rest of the MIR so we must reassemble the extracted registers into a
1859     // 128-bit known-regclass one with code like this:
1860     //
1861     //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
1862     //     %out = CASP %in1, ...
1863     //     %OldLo = G_EXTRACT %out, 0
1864     //     %OldHi = G_EXTRACT %out, 64
1865     auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
1866     unsigned Opcode;
1867     switch (Ordering) {
1868     case AtomicOrdering::Acquire:
1869       Opcode = AArch64::CASPAX;
1870       break;
1871     case AtomicOrdering::Release:
1872       Opcode = AArch64::CASPLX;
1873       break;
1874     case AtomicOrdering::AcquireRelease:
1875     case AtomicOrdering::SequentiallyConsistent:
1876       Opcode = AArch64::CASPALX;
1877       break;
1878     default:
1879       Opcode = AArch64::CASPX;
1880       break;
1881     }
1882 
1883     LLT s128 = LLT::scalar(128);
1884     auto CASDst = MRI.createGenericVirtualRegister(s128);
1885     auto CASDesired = MRI.createGenericVirtualRegister(s128);
1886     auto CASNew = MRI.createGenericVirtualRegister(s128);
1887     MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
1888         .addUse(DesiredI->getOperand(0).getReg())
1889         .addImm(AArch64::sube64)
1890         .addUse(DesiredI->getOperand(1).getReg())
1891         .addImm(AArch64::subo64);
1892     MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
1893         .addUse(NewI->getOperand(0).getReg())
1894         .addImm(AArch64::sube64)
1895         .addUse(NewI->getOperand(1).getReg())
1896         .addImm(AArch64::subo64);
1897 
1898     CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});
1899 
1900     MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
1901     MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
1902   } else {
1903     // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
1904     // can take arbitrary registers so it just has the normal GPR64 operands the
1905     // rest of AArch64 is expecting.
1906     auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
1907     unsigned Opcode;
1908     switch (Ordering) {
1909     case AtomicOrdering::Acquire:
1910       Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
1911       break;
1912     case AtomicOrdering::Release:
1913       Opcode = AArch64::CMP_SWAP_128_RELEASE;
1914       break;
1915     case AtomicOrdering::AcquireRelease:
1916     case AtomicOrdering::SequentiallyConsistent:
1917       Opcode = AArch64::CMP_SWAP_128;
1918       break;
1919     default:
1920       Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
1921       break;
1922     }
1923 
1924     auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1925     CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
1926                                 {Addr, DesiredI->getOperand(0),
1927                                  DesiredI->getOperand(1), NewI->getOperand(0),
1928                                  NewI->getOperand(1)});
1929   }
1930 
1931   CAS.cloneMemRefs(MI);
1932   constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
1933                                    *MRI.getTargetRegisterInfo(),
1934                                    *ST->getRegBankInfo());
1935 
1936   MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi});
1937   MI.eraseFromParent();
1938   return true;
1939 }
1940 
1941 bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
1942                                         LegalizerHelper &Helper) const {
1943   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1944   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1945   LLT Ty = MRI.getType(MI.getOperand(1).getReg());
1946   auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
1947   MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
1948   MI.eraseFromParent();
1949   return true;
1950 }
1951 
1952 bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
1953                                           LegalizerHelper &Helper) const {
1954   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1955 
1956   // Tagged version MOPSMemorySetTagged is legalised in legalizeIntrinsic
1957   if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
1958     // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
1959     // the instruction).
1960     auto &Value = MI.getOperand(1);
1961     Register ExtValueReg =
1962         MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
1963     Value.setReg(ExtValueReg);
1964     return true;
1965   }
1966 
1967   return false;
1968 }
1969 
1970 bool AArch64LegalizerInfo::legalizeFCopySign(MachineInstr &MI,
1971                                              LegalizerHelper &Helper) const {
1972   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1973   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1974   Register Dst = MI.getOperand(0).getReg();
1975   LLT DstTy = MRI.getType(Dst);
1976   assert(DstTy.isScalar() && "Only expected scalars right now!");
1977   const unsigned DstSize = DstTy.getSizeInBits();
1978   assert((DstSize == 32 || DstSize == 64) && "Unexpected dst type!");
1979   assert(MRI.getType(MI.getOperand(2).getReg()) == DstTy &&
1980          "Expected homogeneous types!");
1981 
1982   // We want to materialize a mask with the high bit set.
1983   uint64_t EltMask;
1984   LLT VecTy;
1985 
1986   // TODO: s16 support.
1987   switch (DstSize) {
1988   default:
1989     llvm_unreachable("Unexpected type for G_FCOPYSIGN!");
1990   case 64: {
1991     // AdvSIMD immediate moves cannot materialize out mask in a single
1992     // instruction for 64-bit elements. Instead, materialize zero and then
1993     // negate it.
1994     EltMask = 0;
1995     VecTy = LLT::fixed_vector(2, DstTy);
1996     break;
1997   }
1998   case 32:
1999     EltMask = 0x80000000ULL;
2000     VecTy = LLT::fixed_vector(4, DstTy);
2001     break;
2002   }
2003 
2004   // Widen In1 and In2 to 128 bits. We want these to eventually become
2005   // INSERT_SUBREGs.
2006   auto Undef = MIRBuilder.buildUndef(VecTy);
2007   auto Zero = MIRBuilder.buildConstant(DstTy, 0);
2008   auto Ins1 = MIRBuilder.buildInsertVectorElement(
2009       VecTy, Undef, MI.getOperand(1).getReg(), Zero);
2010   auto Ins2 = MIRBuilder.buildInsertVectorElement(
2011       VecTy, Undef, MI.getOperand(2).getReg(), Zero);
2012 
2013   // Construct the mask.
2014   auto Mask = MIRBuilder.buildConstant(VecTy, EltMask);
2015   if (DstSize == 64)
2016     Mask = MIRBuilder.buildFNeg(VecTy, Mask);
2017 
2018   auto Sel = MIRBuilder.buildInstr(AArch64::G_BSP, {VecTy}, {Mask, Ins2, Ins1});
2019 
2020   // Build an unmerge whose 0th elt is the original G_FCOPYSIGN destination. We
2021   // want this to eventually become an EXTRACT_SUBREG.
2022   SmallVector<Register, 2> DstRegs(1, Dst);
2023   for (unsigned I = 1, E = VecTy.getNumElements(); I < E; ++I)
2024     DstRegs.push_back(MRI.createGenericVirtualRegister(DstTy));
2025   MIRBuilder.buildUnmerge(DstRegs, Sel);
2026   MI.eraseFromParent();
2027   return true;
2028 }
2029 
2030 bool AArch64LegalizerInfo::legalizeExtractVectorElt(
2031     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2032   assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT);
2033   auto VRegAndVal =
2034       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2035   if (VRegAndVal)
2036     return true;
2037   return Helper.lowerExtractInsertVectorElt(MI) !=
2038          LegalizerHelper::LegalizeResult::UnableToLegalize;
2039 }
2040 
2041 bool AArch64LegalizerInfo::legalizeDynStackAlloc(
2042     MachineInstr &MI, LegalizerHelper &Helper) const {
2043   MachineFunction &MF = *MI.getParent()->getParent();
2044   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2045   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
2046 
2047   // If stack probing is not enabled for this function, use the default
2048   // lowering.
2049   if (!MF.getFunction().hasFnAttribute("probe-stack") ||
2050       MF.getFunction().getFnAttribute("probe-stack").getValueAsString() !=
2051           "inline-asm") {
2052     Helper.lowerDynStackAlloc(MI);
2053     return true;
2054   }
2055 
2056   Register Dst = MI.getOperand(0).getReg();
2057   Register AllocSize = MI.getOperand(1).getReg();
2058   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
2059 
2060   assert(MRI.getType(Dst) == LLT::pointer(0, 64) &&
2061          "Unexpected type for dynamic alloca");
2062   assert(MRI.getType(AllocSize) == LLT::scalar(64) &&
2063          "Unexpected type for dynamic alloca");
2064 
2065   LLT PtrTy = MRI.getType(Dst);
2066   Register SPReg =
2067       Helper.getTargetLowering().getStackPointerRegisterToSaveRestore();
2068   Register SPTmp =
2069       Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
2070   auto NewMI =
2071       MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp});
2072   MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass);
2073   MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI);
2074   MIRBuilder.buildCopy(Dst, SPTmp);
2075 
2076   MI.eraseFromParent();
2077   return true;
2078 }
2079 
2080 bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
2081                                             LegalizerHelper &Helper) const {
2082   MachineIRBuilder &MIB = Helper.MIRBuilder;
2083   auto &AddrVal = MI.getOperand(0);
2084 
2085   int64_t IsWrite = MI.getOperand(1).getImm();
2086   int64_t Locality = MI.getOperand(2).getImm();
2087   int64_t IsData = MI.getOperand(3).getImm();
2088 
2089   bool IsStream = Locality == 0;
2090   if (Locality != 0) {
2091     assert(Locality <= 3 && "Prefetch locality out-of-range");
2092     // The locality degree is the opposite of the cache speed.
2093     // Put the number the other way around.
2094     // The encoding starts at 0 for level 1
2095     Locality = 3 - Locality;
2096   }
2097 
2098   unsigned PrfOp = (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;
2099 
2100   MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
2101   MI.eraseFromParent();
2102   return true;
2103 }
2104