xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (revision b9128a37faafede823eb456aa65a11ac69997284)
1 //===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64LegalizerInfo.h"
15 #include "AArch64RegisterBankInfo.h"
16 #include "AArch64Subtarget.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
19 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
20 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
21 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/Utils.h"
24 #include "llvm/CodeGen/MachineInstr.h"
25 #include "llvm/CodeGen/MachineRegisterInfo.h"
26 #include "llvm/CodeGen/TargetOpcodes.h"
27 #include "llvm/CodeGen/ValueTypes.h"
28 #include "llvm/IR/DerivedTypes.h"
29 #include "llvm/IR/Intrinsics.h"
30 #include "llvm/IR/IntrinsicsAArch64.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/MathExtras.h"
33 #include <initializer_list>
34 
35 #define DEBUG_TYPE "aarch64-legalinfo"
36 
37 using namespace llvm;
38 using namespace LegalizeActions;
39 using namespace LegalizeMutations;
40 using namespace LegalityPredicates;
41 using namespace MIPatternMatch;
42 
43 AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
44     : ST(&ST) {
45   using namespace TargetOpcode;
46   const LLT p0 = LLT::pointer(0, 64);
47   const LLT s8 = LLT::scalar(8);
48   const LLT s16 = LLT::scalar(16);
49   const LLT s32 = LLT::scalar(32);
50   const LLT s64 = LLT::scalar(64);
51   const LLT s128 = LLT::scalar(128);
52   const LLT v16s8 = LLT::fixed_vector(16, 8);
53   const LLT v8s8 = LLT::fixed_vector(8, 8);
54   const LLT v4s8 = LLT::fixed_vector(4, 8);
55   const LLT v8s16 = LLT::fixed_vector(8, 16);
56   const LLT v4s16 = LLT::fixed_vector(4, 16);
57   const LLT v2s16 = LLT::fixed_vector(2, 16);
58   const LLT v2s32 = LLT::fixed_vector(2, 32);
59   const LLT v4s32 = LLT::fixed_vector(4, 32);
60   const LLT v2s64 = LLT::fixed_vector(2, 64);
61   const LLT v2p0 = LLT::fixed_vector(2, p0);
62 
63   std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
64                                                         v16s8, v8s16, v4s32,
65                                                         v2s64, v2p0,
66                                                         /* End 128bit types */
67                                                         /* Begin 64bit types */
68                                                         v8s8, v4s16, v2s32};
69   std::initializer_list<LLT> ScalarAndPtrTypesList = {s8, s16, s32, s64, p0};
70   SmallVector<LLT, 8> PackedVectorAllTypesVec(PackedVectorAllTypeList);
71   SmallVector<LLT, 8> ScalarAndPtrTypesVec(ScalarAndPtrTypesList);
72 
73   const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
74 
75   // FIXME: support subtargets which have neon/fp-armv8 disabled.
76   if (!ST.hasNEON() || !ST.hasFPARMv8()) {
77     getLegacyLegalizerInfo().computeTables();
78     return;
79   }
80 
81   // Some instructions only support s16 if the subtarget has full 16-bit FP
82   // support.
83   const bool HasFP16 = ST.hasFullFP16();
84   const LLT &MinFPScalar = HasFP16 ? s16 : s32;
85 
86   const bool HasCSSC = ST.hasCSSC();
87   const bool HasRCPC3 = ST.hasRCPC3();
88 
89   getActionDefinitionsBuilder(
90       {G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER})
91       .legalFor({p0, s8, s16, s32, s64})
92       .legalFor(PackedVectorAllTypeList)
93       .widenScalarToNextPow2(0)
94       .clampScalar(0, s8, s64)
95       .fewerElementsIf(
96           [=](const LegalityQuery &Query) {
97             return Query.Types[0].isVector() &&
98                    (Query.Types[0].getElementType() != s64 ||
99                     Query.Types[0].getNumElements() != 2);
100           },
101           [=](const LegalityQuery &Query) {
102             LLT EltTy = Query.Types[0].getElementType();
103             if (EltTy == s64)
104               return std::make_pair(0, LLT::fixed_vector(2, 64));
105             return std::make_pair(0, EltTy);
106           });
107 
108   getActionDefinitionsBuilder(G_PHI)
109       .legalFor({p0, s16, s32, s64})
110       .legalFor(PackedVectorAllTypeList)
111       .widenScalarToNextPow2(0)
112       .clampScalar(0, s16, s64)
113       // Maximum: sN * k = 128
114       .clampMaxNumElements(0, s8, 16)
115       .clampMaxNumElements(0, s16, 8)
116       .clampMaxNumElements(0, s32, 4)
117       .clampMaxNumElements(0, s64, 2)
118       .clampMaxNumElements(0, p0, 2);
119 
120   getActionDefinitionsBuilder(G_BSWAP)
121       .legalFor({s32, s64, v4s32, v2s32, v2s64})
122       .widenScalarToNextPow2(0)
123       .clampScalar(0, s32, s64);
124 
125   getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
126       .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
127       .widenScalarToNextPow2(0)
128       .clampScalar(0, s32, s64)
129       .clampMaxNumElements(0, s8, 16)
130       .clampMaxNumElements(0, s16, 8)
131       .clampNumElements(0, v2s32, v4s32)
132       .clampNumElements(0, v2s64, v2s64)
133       .minScalarOrEltIf(
134           [=](const LegalityQuery &Query) {
135             return Query.Types[0].getNumElements() <= 2;
136           },
137           0, s32)
138       .minScalarOrEltIf(
139           [=](const LegalityQuery &Query) {
140             return Query.Types[0].getNumElements() <= 4;
141           },
142           0, s16)
143       .minScalarOrEltIf(
144           [=](const LegalityQuery &Query) {
145             return Query.Types[0].getNumElements() <= 16;
146           },
147           0, s8)
148       .moreElementsToNextPow2(0);
149 
150   getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
151       .customIf([=](const LegalityQuery &Query) {
152         const auto &SrcTy = Query.Types[0];
153         const auto &AmtTy = Query.Types[1];
154         return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
155                AmtTy.getSizeInBits() == 32;
156       })
157       .legalFor({
158           {s32, s32},
159           {s32, s64},
160           {s64, s64},
161           {v8s8, v8s8},
162           {v16s8, v16s8},
163           {v4s16, v4s16},
164           {v8s16, v8s16},
165           {v2s32, v2s32},
166           {v4s32, v4s32},
167           {v2s64, v2s64},
168       })
169       .widenScalarToNextPow2(0)
170       .clampScalar(1, s32, s64)
171       .clampScalar(0, s32, s64)
172       .clampNumElements(0, v8s8, v16s8)
173       .clampNumElements(0, v4s16, v8s16)
174       .clampNumElements(0, v2s32, v4s32)
175       .clampNumElements(0, v2s64, v2s64)
176       .moreElementsToNextPow2(0)
177       .minScalarSameAs(1, 0);
178 
179   getActionDefinitionsBuilder(G_PTR_ADD)
180       .legalFor({{p0, s64}, {v2p0, v2s64}})
181       .clampScalar(1, s64, s64);
182 
183   getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
184 
185   getActionDefinitionsBuilder({G_SDIV, G_UDIV})
186       .legalFor({s32, s64})
187       .libcallFor({s128})
188       .clampScalar(0, s32, s64)
189       .widenScalarToNextPow2(0)
190       .scalarize(0);
191 
192   getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
193       .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32})
194       .widenScalarOrEltToNextPow2(0)
195       .clampScalarOrElt(0, s32, s64)
196       .clampNumElements(0, v2s32, v4s32)
197       .clampNumElements(0, v2s64, v2s64)
198       .moreElementsToNextPow2(0);
199 
200 
201   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
202       .widenScalarToNextPow2(0, /*Min = */ 32)
203       .clampScalar(0, s32, s64)
204       .lower();
205 
206   getActionDefinitionsBuilder({G_SMULH, G_UMULH})
207       .legalFor({s64, v8s16, v16s8, v4s32})
208       .lower();
209 
210   auto &MinMaxActions = getActionDefinitionsBuilder(
211       {G_SMIN, G_SMAX, G_UMIN, G_UMAX});
212   if (HasCSSC)
213     MinMaxActions
214         .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
215         // Making clamping conditional on CSSC extension as without legal types we
216         // lower to CMP which can fold one of the two sxtb's we'd otherwise need
217         // if we detect a type smaller than 32-bit.
218         .minScalar(0, s32);
219   else
220     MinMaxActions
221         .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32});
222   MinMaxActions
223       .clampNumElements(0, v8s8, v16s8)
224       .clampNumElements(0, v4s16, v8s16)
225       .clampNumElements(0, v2s32, v4s32)
226       // FIXME: This shouldn't be needed as v2s64 types are going to
227       // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
228       .clampNumElements(0, v2s64, v2s64)
229       .lower();
230 
231   getActionDefinitionsBuilder(
232       {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
233       .legalFor({{s32, s32}, {s64, s32}})
234       .clampScalar(0, s32, s64)
235        .clampScalar(1, s32, s64)
236       .widenScalarToNextPow2(0);
237 
238   getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FNEG,
239                                G_FABS, G_FSQRT, G_FMAXNUM, G_FMINNUM,
240                                G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR,
241                                G_FRINT, G_FNEARBYINT, G_INTRINSIC_TRUNC,
242                                G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
243       .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
244       .legalIf([=](const LegalityQuery &Query) {
245         const auto &Ty = Query.Types[0];
246         return (Ty == v8s16 || Ty == v4s16) && HasFP16;
247       })
248       .libcallFor({s128})
249       .minScalarOrElt(0, MinFPScalar)
250       .clampNumElements(0, v4s16, v8s16)
251       .clampNumElements(0, v2s32, v4s32)
252       .clampNumElements(0, v2s64, v2s64)
253       .moreElementsToNextPow2(0);
254 
255   getActionDefinitionsBuilder(G_FREM)
256       .libcallFor({s32, s64})
257       .minScalar(0, s32)
258       .scalarize(0);
259 
260   getActionDefinitionsBuilder(G_INTRINSIC_LRINT)
261       // If we don't have full FP16 support, then scalarize the elements of
262       // vectors containing fp16 types.
263       .fewerElementsIf(
264           [=, &ST](const LegalityQuery &Query) {
265             const auto &Ty = Query.Types[0];
266             return Ty.isVector() && Ty.getElementType() == s16 &&
267                    !ST.hasFullFP16();
268           },
269           [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
270       // If we don't have full FP16 support, then widen s16 to s32 if we
271       // encounter it.
272       .widenScalarIf(
273           [=, &ST](const LegalityQuery &Query) {
274             return Query.Types[0] == s16 && !ST.hasFullFP16();
275           },
276           [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
277       .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16});
278 
279   getActionDefinitionsBuilder(
280       {G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2, G_FLOG10,
281        G_FEXP, G_FEXP2, G_FEXP10})
282       // We need a call for these, so we always need to scalarize.
283       .scalarize(0)
284       // Regardless of FP16 support, widen 16-bit elements to 32-bits.
285       .minScalar(0, s32)
286       .libcallFor({s32, s64});
287   getActionDefinitionsBuilder(G_FPOWI)
288       .scalarize(0)
289       .minScalar(0, s32)
290       .libcallFor({{s32, s32}, {s64, s32}});
291 
292   getActionDefinitionsBuilder(G_INSERT)
293       .legalIf(all(typeInSet(0, {s32, s64, p0}),
294                    typeInSet(1, {s8, s16, s32}), smallerThan(1, 0)))
295       .widenScalarToNextPow2(0)
296       .clampScalar(0, s32, s64)
297       .widenScalarToNextPow2(1)
298       .minScalar(1, s8)
299       .maxScalarIf(typeInSet(0, {s32}), 1, s16)
300       .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);
301 
302   getActionDefinitionsBuilder(G_EXTRACT)
303       .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
304                    typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
305       .widenScalarToNextPow2(1)
306       .clampScalar(1, s32, s128)
307       .widenScalarToNextPow2(0)
308       .minScalar(0, s16)
309       .maxScalarIf(typeInSet(1, {s32}), 0, s16)
310       .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
311       .maxScalarIf(typeInSet(1, {s128}), 0, s64);
312 
313 
314   for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
315     auto &Actions =  getActionDefinitionsBuilder(Op);
316 
317     if (Op == G_SEXTLOAD)
318       Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));
319 
320     // Atomics have zero extending behavior.
321     Actions
322       .legalForTypesWithMemDesc({{s32, p0, s8, 8},
323                                  {s32, p0, s16, 8},
324                                  {s32, p0, s32, 8},
325                                  {s64, p0, s8, 2},
326                                  {s64, p0, s16, 2},
327                                  {s64, p0, s32, 4},
328                                  {s64, p0, s64, 8},
329                                  {p0, p0, s64, 8},
330                                  {v2s32, p0, s64, 8}})
331       .widenScalarToNextPow2(0)
332       .clampScalar(0, s32, s64)
333       // TODO: We could support sum-of-pow2's but the lowering code doesn't know
334       //       how to do that yet.
335       .unsupportedIfMemSizeNotPow2()
336       // Lower anything left over into G_*EXT and G_LOAD
337       .lower();
338   }
339 
340   auto IsPtrVecPred = [=](const LegalityQuery &Query) {
341     const LLT &ValTy = Query.Types[0];
342     if (!ValTy.isVector())
343       return false;
344     const LLT EltTy = ValTy.getElementType();
345     return EltTy.isPointer() && EltTy.getAddressSpace() == 0;
346   };
347 
348   getActionDefinitionsBuilder(G_LOAD)
349       .customIf([=](const LegalityQuery &Query) {
350         return HasRCPC3 && Query.Types[0] == s128 &&
351                Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
352       })
353       .customIf([=](const LegalityQuery &Query) {
354         return Query.Types[0] == s128 &&
355                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
356       })
357       .legalForTypesWithMemDesc({{s8, p0, s8, 8},
358                                  {s16, p0, s16, 8},
359                                  {s32, p0, s32, 8},
360                                  {s64, p0, s64, 8},
361                                  {p0, p0, s64, 8},
362                                  {s128, p0, s128, 8},
363                                  {v8s8, p0, s64, 8},
364                                  {v16s8, p0, s128, 8},
365                                  {v4s16, p0, s64, 8},
366                                  {v8s16, p0, s128, 8},
367                                  {v2s32, p0, s64, 8},
368                                  {v4s32, p0, s128, 8},
369                                  {v2s64, p0, s128, 8}})
370       // These extends are also legal
371       .legalForTypesWithMemDesc(
372           {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
373       .widenScalarToNextPow2(0, /* MinSize = */ 8)
374       .lowerIfMemSizeNotByteSizePow2()
375       .clampScalar(0, s8, s64)
376       .narrowScalarIf(
377           [=](const LegalityQuery &Query) {
378             // Clamp extending load results to 32-bits.
379             return Query.Types[0].isScalar() &&
380                    Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
381                    Query.Types[0].getSizeInBits() > 32;
382           },
383           changeTo(0, s32))
384       .clampMaxNumElements(0, s8, 16)
385       .clampMaxNumElements(0, s16, 8)
386       .clampMaxNumElements(0, s32, 4)
387       .clampMaxNumElements(0, s64, 2)
388       .clampMaxNumElements(0, p0, 2)
389       .customIf(IsPtrVecPred)
390       .scalarizeIf(typeIs(0, v2s16), 0);
391 
392   getActionDefinitionsBuilder(G_STORE)
393       .customIf([=](const LegalityQuery &Query) {
394         return HasRCPC3 && Query.Types[0] == s128 &&
395                Query.MMODescrs[0].Ordering == AtomicOrdering::Release;
396       })
397       .customIf([=](const LegalityQuery &Query) {
398         return Query.Types[0] == s128 &&
399                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
400       })
401       .legalForTypesWithMemDesc(
402           {{s8, p0, s8, 8},     {s16, p0, s8, 8},  // truncstorei8 from s16
403            {s32, p0, s8, 8},                       // truncstorei8 from s32
404            {s64, p0, s8, 8},                       // truncstorei8 from s64
405            {s16, p0, s16, 8},   {s32, p0, s16, 8}, // truncstorei16 from s32
406            {s64, p0, s16, 8},                      // truncstorei16 from s64
407            {s32, p0, s8, 8},    {s32, p0, s16, 8},    {s32, p0, s32, 8},
408            {s64, p0, s64, 8},   {s64, p0, s32, 8}, // truncstorei32 from s64
409            {p0, p0, s64, 8},    {s128, p0, s128, 8},  {v16s8, p0, s128, 8},
410            {v8s8, p0, s64, 8},  {v4s16, p0, s64, 8},  {v8s16, p0, s128, 8},
411            {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
412       .clampScalar(0, s8, s64)
413       .lowerIf([=](const LegalityQuery &Query) {
414         return Query.Types[0].isScalar() &&
415                Query.Types[0] != Query.MMODescrs[0].MemoryTy;
416       })
417       // Maximum: sN * k = 128
418       .clampMaxNumElements(0, s8, 16)
419       .clampMaxNumElements(0, s16, 8)
420       .clampMaxNumElements(0, s32, 4)
421       .clampMaxNumElements(0, s64, 2)
422       .clampMaxNumElements(0, p0, 2)
423       .lowerIfMemSizeNotPow2()
424       .customIf(IsPtrVecPred)
425       .scalarizeIf(typeIs(0, v2s16), 0);
426 
427   getActionDefinitionsBuilder(G_INDEXED_STORE)
428       // Idx 0 == Ptr, Idx 1 == Val
429       // TODO: we can implement legalizations but as of now these are
430       // generated in a very specific way.
431       .legalForTypesWithMemDesc({
432           {p0, s8, s8, 8},
433           {p0, s16, s16, 8},
434           {p0, s32, s8, 8},
435           {p0, s32, s16, 8},
436           {p0, s32, s32, 8},
437           {p0, s64, s64, 8},
438           {p0, p0, p0, 8},
439           {p0, v8s8, v8s8, 8},
440           {p0, v16s8, v16s8, 8},
441           {p0, v4s16, v4s16, 8},
442           {p0, v8s16, v8s16, 8},
443           {p0, v2s32, v2s32, 8},
444           {p0, v4s32, v4s32, 8},
445           {p0, v2s64, v2s64, 8},
446           {p0, v2p0, v2p0, 8},
447           {p0, s128, s128, 8},
448       })
449       .unsupported();
450 
451   auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {
452     LLT LdTy = Query.Types[0];
453     LLT PtrTy = Query.Types[1];
454     if (!llvm::is_contained(PackedVectorAllTypesVec, LdTy) &&
455         !llvm::is_contained(ScalarAndPtrTypesVec, LdTy) && LdTy != s128)
456       return false;
457     if (PtrTy != p0)
458       return false;
459     return true;
460   };
461   getActionDefinitionsBuilder(G_INDEXED_LOAD)
462       .unsupportedIf(
463           atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
464       .legalIf(IndexedLoadBasicPred)
465       .unsupported();
466   getActionDefinitionsBuilder({G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD})
467       .unsupportedIf(
468           atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
469       .legalIf(all(typeInSet(0, {s16, s32, s64}),
470                    LegalityPredicate([=](const LegalityQuery &Q) {
471                      LLT LdTy = Q.Types[0];
472                      LLT PtrTy = Q.Types[1];
473                      LLT MemTy = Q.MMODescrs[0].MemoryTy;
474                      if (PtrTy != p0)
475                        return false;
476                      if (LdTy == s16)
477                        return MemTy == s8;
478                      if (LdTy == s32)
479                        return MemTy == s8 || MemTy == s16;
480                      if (LdTy == s64)
481                        return MemTy == s8 || MemTy == s16 || MemTy == s32;
482                      return false;
483                    })))
484       .unsupported();
485 
486   // Constants
487   getActionDefinitionsBuilder(G_CONSTANT)
488       .legalFor({p0, s8, s16, s32, s64})
489       .widenScalarToNextPow2(0)
490       .clampScalar(0, s8, s64);
491   getActionDefinitionsBuilder(G_FCONSTANT)
492       .legalIf([=](const LegalityQuery &Query) {
493         const auto &Ty = Query.Types[0];
494         if (HasFP16 && Ty == s16)
495           return true;
496         return Ty == s32 || Ty == s64 || Ty == s128;
497       })
498       .clampScalar(0, MinFPScalar, s128);
499 
500   // FIXME: fix moreElementsToNextPow2
501   getActionDefinitionsBuilder(G_ICMP)
502       .legalFor({{s32, s32},
503                  {s32, s64},
504                  {s32, p0},
505                  {v4s32, v4s32},
506                  {v2s32, v2s32},
507                  {v2s64, v2s64},
508                  {v2s64, v2p0},
509                  {v4s16, v4s16},
510                  {v8s16, v8s16},
511                  {v8s8, v8s8},
512                  {v16s8, v16s8}})
513       .widenScalarOrEltToNextPow2(1)
514       .clampScalar(1, s32, s64)
515       .clampScalar(0, s32, s32)
516       .minScalarEltSameAsIf(
517           [=](const LegalityQuery &Query) {
518             const LLT &Ty = Query.Types[0];
519             const LLT &SrcTy = Query.Types[1];
520             return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
521                    Ty.getElementType() != SrcTy.getElementType();
522           },
523           0, 1)
524       .minScalarOrEltIf(
525           [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
526           1, s32)
527       .minScalarOrEltIf(
528           [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
529           s64)
530       .moreElementsToNextPow2(0)
531       .clampNumElements(0, v8s8, v16s8)
532       .clampNumElements(0, v4s16, v8s16)
533       .clampNumElements(0, v2s32, v4s32)
534       .clampNumElements(0, v2s64, v2s64);
535 
536   getActionDefinitionsBuilder(G_FCMP)
537       // If we don't have full FP16 support, then scalarize the elements of
538       // vectors containing fp16 types.
539       .fewerElementsIf(
540           [=](const LegalityQuery &Query) {
541             const auto &Ty = Query.Types[0];
542             return Ty.isVector() && Ty.getElementType() == s16 && !HasFP16;
543           },
544           [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
545       // If we don't have full FP16 support, then widen s16 to s32 if we
546       // encounter it.
547       .widenScalarIf(
548           [=](const LegalityQuery &Query) {
549             return Query.Types[0] == s16 && !HasFP16;
550           },
551           [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
552       .legalFor({{s16, s16},
553                  {s32, s32},
554                  {s32, s64},
555                  {v4s32, v4s32},
556                  {v2s32, v2s32},
557                  {v2s64, v2s64},
558                  {v4s16, v4s16},
559                  {v8s16, v8s16}})
560       .widenScalarOrEltToNextPow2(1)
561       .clampScalar(1, s32, s64)
562       .clampScalar(0, s32, s32)
563       .minScalarEltSameAsIf(
564           [=](const LegalityQuery &Query) {
565             const LLT &Ty = Query.Types[0];
566             const LLT &SrcTy = Query.Types[1];
567             return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
568                    Ty.getElementType() != SrcTy.getElementType();
569           },
570           0, 1)
571       .clampNumElements(0, v2s32, v4s32)
572       .clampMaxNumElements(1, s64, 2);
573 
574   // Extensions
575   auto ExtLegalFunc = [=](const LegalityQuery &Query) {
576     unsigned DstSize = Query.Types[0].getSizeInBits();
577 
578     // Handle legal vectors using legalFor
579     if (Query.Types[0].isVector())
580       return false;
581 
582     if (DstSize < 8 || DstSize >= 128 || !isPowerOf2_32(DstSize))
583       return false; // Extending to a scalar s128 needs narrowing.
584 
585     const LLT &SrcTy = Query.Types[1];
586 
587     // Make sure we fit in a register otherwise. Don't bother checking that
588     // the source type is below 128 bits. We shouldn't be allowing anything
589     // through which is wider than the destination in the first place.
590     unsigned SrcSize = SrcTy.getSizeInBits();
591     if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
592       return false;
593 
594     return true;
595   };
596   getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
597       .legalIf(ExtLegalFunc)
598       .legalFor({{v2s64, v2s32}, {v4s32, v4s16}, {v8s16, v8s8}})
599       .clampScalar(0, s64, s64) // Just for s128, others are handled above.
600       .moreElementsToNextPow2(1)
601       .clampMaxNumElements(1, s8, 8)
602       .clampMaxNumElements(1, s16, 4)
603       .clampMaxNumElements(1, s32, 2)
604       // Tries to convert a large EXTEND into two smaller EXTENDs
605       .lowerIf([=](const LegalityQuery &Query) {
606         return (Query.Types[0].getScalarSizeInBits() >
607                 Query.Types[1].getScalarSizeInBits() * 2) &&
608                Query.Types[0].isVector() &&
609                (Query.Types[1].getScalarSizeInBits() == 8 ||
610                 Query.Types[1].getScalarSizeInBits() == 16);
611       });
612 
613   getActionDefinitionsBuilder(G_TRUNC)
614       .legalFor({{v2s32, v2s64}, {v4s16, v4s32}, {v8s8, v8s16}})
615       .moreElementsToNextPow2(0)
616       .clampMaxNumElements(0, s8, 8)
617       .clampMaxNumElements(0, s16, 4)
618       .clampMaxNumElements(0, s32, 2)
619       .minScalarOrEltIf(
620           [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
621           0, s8)
622       .lowerIf([=](const LegalityQuery &Query) {
623         LLT DstTy = Query.Types[0];
624         LLT SrcTy = Query.Types[1];
625         return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
626                DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
627       })
628 
629       .alwaysLegal();
630 
631   getActionDefinitionsBuilder(G_SEXT_INREG)
632       .legalFor({s32, s64})
633       .legalFor(PackedVectorAllTypeList)
634       .maxScalar(0, s64)
635       .clampNumElements(0, v8s8, v16s8)
636       .clampNumElements(0, v4s16, v8s16)
637       .clampNumElements(0, v2s32, v4s32)
638       .clampMaxNumElements(0, s64, 2)
639       .lower();
640 
641   // FP conversions
642   getActionDefinitionsBuilder(G_FPTRUNC)
643       .legalFor(
644           {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
645       .clampNumElements(0, v4s16, v4s16)
646       .clampNumElements(0, v2s32, v2s32)
647       .scalarize(0);
648 
649   getActionDefinitionsBuilder(G_FPEXT)
650       .legalFor(
651           {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
652       .clampNumElements(0, v4s32, v4s32)
653       .clampNumElements(0, v2s64, v2s64)
654       .scalarize(0);
655 
656   // Conversions
657   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
658       .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
659       .legalIf([=](const LegalityQuery &Query) {
660         return HasFP16 &&
661                (Query.Types[1] == s16 || Query.Types[1] == v4s16 ||
662                 Query.Types[1] == v8s16) &&
663                (Query.Types[0] == s32 || Query.Types[0] == s64 ||
664                 Query.Types[0] == v4s16 || Query.Types[0] == v8s16);
665       })
666       .widenScalarToNextPow2(0)
667       .clampScalar(0, s32, s64)
668       .widenScalarToNextPow2(1)
669       .clampScalarOrElt(1, MinFPScalar, s64)
670       .moreElementsToNextPow2(0)
671       .widenScalarIf(
672           [=](const LegalityQuery &Query) {
673             return Query.Types[0].getScalarSizeInBits() >
674                    Query.Types[1].getScalarSizeInBits();
675           },
676           LegalizeMutations::changeElementSizeTo(1, 0))
677       .widenScalarIf(
678           [=](const LegalityQuery &Query) {
679             return Query.Types[0].getScalarSizeInBits() <
680                    Query.Types[1].getScalarSizeInBits();
681           },
682           LegalizeMutations::changeElementSizeTo(0, 1))
683       .clampNumElements(0, v4s16, v8s16)
684       .clampNumElements(0, v2s32, v4s32)
685       .clampMaxNumElements(0, s64, 2);
686 
687   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
688       .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
689       .legalIf([=](const LegalityQuery &Query) {
690         return HasFP16 &&
691                (Query.Types[0] == s16 || Query.Types[0] == v4s16 ||
692                 Query.Types[0] == v8s16) &&
693                (Query.Types[1] == s32 || Query.Types[1] == s64 ||
694                 Query.Types[1] == v4s16 || Query.Types[1] == v8s16);
695       })
696       .widenScalarToNextPow2(1)
697       .clampScalar(1, s32, s64)
698       .widenScalarToNextPow2(0)
699       .clampScalarOrElt(0, MinFPScalar, s64)
700       .moreElementsToNextPow2(0)
701       .widenScalarIf(
702           [=](const LegalityQuery &Query) {
703             return Query.Types[0].getScalarSizeInBits() <
704                    Query.Types[1].getScalarSizeInBits();
705           },
706           LegalizeMutations::changeElementSizeTo(0, 1))
707       .widenScalarIf(
708           [=](const LegalityQuery &Query) {
709             return Query.Types[0].getScalarSizeInBits() >
710                    Query.Types[1].getScalarSizeInBits();
711           },
712           LegalizeMutations::changeElementSizeTo(1, 0))
713       .clampNumElements(0, v4s16, v8s16)
714       .clampNumElements(0, v2s32, v4s32)
715       .clampMaxNumElements(0, s64, 2);
716 
717   // Control-flow
718   getActionDefinitionsBuilder(G_BRCOND)
719     .legalFor({s32})
720     .clampScalar(0, s32, s32);
721   getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
722 
723   getActionDefinitionsBuilder(G_SELECT)
724       .legalFor({{s32, s32}, {s64, s32}, {p0, s32}})
725       .widenScalarToNextPow2(0)
726       .clampScalar(0, s32, s64)
727       .clampScalar(1, s32, s32)
728       .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
729       .lowerIf(isVector(0));
730 
731   // Pointer-handling
732   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
733 
734   if (TM.getCodeModel() == CodeModel::Small)
735     getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
736   else
737     getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
738 
739   getActionDefinitionsBuilder(G_PTRTOINT)
740       .legalFor({{s64, p0}, {v2s64, v2p0}})
741       .widenScalarToNextPow2(0, 64)
742       .clampScalar(0, s64, s64);
743 
744   getActionDefinitionsBuilder(G_INTTOPTR)
745       .unsupportedIf([&](const LegalityQuery &Query) {
746         return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
747       })
748       .legalFor({{p0, s64}, {v2p0, v2s64}});
749 
750   // Casts for 32 and 64-bit width type are just copies.
751   // Same for 128-bit width type, except they are on the FPR bank.
752   getActionDefinitionsBuilder(G_BITCAST)
753       // FIXME: This is wrong since G_BITCAST is not allowed to change the
754       // number of bits but it's what the previous code described and fixing
755       // it breaks tests.
756       .legalForCartesianProduct({s8, s16, s32, s64, s128, v16s8, v8s8, v4s8,
757                                  v8s16, v4s16, v2s16, v4s32, v2s32, v2s64,
758                                  v2p0});
759 
760   getActionDefinitionsBuilder(G_VASTART).legalFor({p0});
761 
762   // va_list must be a pointer, but most sized types are pretty easy to handle
763   // as the destination.
764   getActionDefinitionsBuilder(G_VAARG)
765       .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
766       .clampScalar(0, s8, s64)
767       .widenScalarToNextPow2(0, /*Min*/ 8);
768 
769   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
770       .lowerIf(
771           all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
772 
773   LegalityPredicate UseOutlineAtomics = [&ST](const LegalityQuery &Query) {
774     return ST.outlineAtomics() && !ST.hasLSE();
775   };
776 
777   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
778       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
779                    predNot(UseOutlineAtomics)))
780       .customIf(all(typeIs(0, s128), predNot(UseOutlineAtomics)))
781       .customIf([UseOutlineAtomics](const LegalityQuery &Query) {
782         return Query.Types[0].getSizeInBits() == 128 &&
783                !UseOutlineAtomics(Query);
784       })
785       .libcallIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, p0),
786                      UseOutlineAtomics))
787       .clampScalar(0, s32, s64);
788 
789   getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
790                                G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR,
791                                G_ATOMICRMW_XOR})
792       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
793                    predNot(UseOutlineAtomics)))
794       .libcallIf(all(typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0),
795                      UseOutlineAtomics))
796       .clampScalar(0, s32, s64);
797 
798   // Do not outline these atomics operations, as per comment in
799   // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR().
800   getActionDefinitionsBuilder(
801       {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
802       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)))
803       .clampScalar(0, s32, s64);
804 
805   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});
806 
807   // Merge/Unmerge
808   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
809     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
810     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
811     getActionDefinitionsBuilder(Op)
812         .widenScalarToNextPow2(LitTyIdx, 8)
813         .widenScalarToNextPow2(BigTyIdx, 32)
814         .clampScalar(LitTyIdx, s8, s64)
815         .clampScalar(BigTyIdx, s32, s128)
816         .legalIf([=](const LegalityQuery &Q) {
817           switch (Q.Types[BigTyIdx].getSizeInBits()) {
818           case 32:
819           case 64:
820           case 128:
821             break;
822           default:
823             return false;
824           }
825           switch (Q.Types[LitTyIdx].getSizeInBits()) {
826           case 8:
827           case 16:
828           case 32:
829           case 64:
830             return true;
831           default:
832             return false;
833           }
834         });
835   }
836 
837   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
838       .unsupportedIf([=](const LegalityQuery &Query) {
839         const LLT &EltTy = Query.Types[1].getElementType();
840         return Query.Types[0] != EltTy;
841       })
842       .minScalar(2, s64)
843       .customIf([=](const LegalityQuery &Query) {
844         const LLT &VecTy = Query.Types[1];
845         return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
846                VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
847                VecTy == v8s8 || VecTy == v16s8 || VecTy == v2p0;
848       })
849       .minScalarOrEltIf(
850           [=](const LegalityQuery &Query) {
851             // We want to promote to <M x s1> to <M x s64> if that wouldn't
852             // cause the total vec size to be > 128b.
853             return Query.Types[1].getNumElements() <= 2;
854           },
855           0, s64)
856       .minScalarOrEltIf(
857           [=](const LegalityQuery &Query) {
858             return Query.Types[1].getNumElements() <= 4;
859           },
860           0, s32)
861       .minScalarOrEltIf(
862           [=](const LegalityQuery &Query) {
863             return Query.Types[1].getNumElements() <= 8;
864           },
865           0, s16)
866       .minScalarOrEltIf(
867           [=](const LegalityQuery &Query) {
868             return Query.Types[1].getNumElements() <= 16;
869           },
870           0, s8)
871       .minScalarOrElt(0, s8) // Worst case, we need at least s8.
872       .moreElementsToNextPow2(1)
873       .clampMaxNumElements(1, s64, 2)
874       .clampMaxNumElements(1, s32, 4)
875       .clampMaxNumElements(1, s16, 8)
876       .clampMaxNumElements(1, p0, 2);
877 
878   getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
879       .legalIf(typeInSet(0, {v16s8, v8s8, v8s16, v4s16, v4s32, v2s32, v2s64}))
880       .widenVectorEltsToVectorMinSize(0, 64);
881 
882   getActionDefinitionsBuilder(G_BUILD_VECTOR)
883       .legalFor({{v8s8, s8},
884                  {v16s8, s8},
885                  {v4s16, s16},
886                  {v8s16, s16},
887                  {v2s32, s32},
888                  {v4s32, s32},
889                  {v2p0, p0},
890                  {v2s64, s64}})
891       .clampNumElements(0, v4s32, v4s32)
892       .clampNumElements(0, v2s64, v2s64)
893       .minScalarOrElt(0, s8)
894       .widenVectorEltsToVectorMinSize(0, 64)
895       .minScalarSameAs(1, 0);
896 
897   getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
898 
899   getActionDefinitionsBuilder(G_CTLZ)
900       .legalForCartesianProduct(
901           {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
902       .scalarize(1)
903       .widenScalarToNextPow2(1, /*Min=*/32)
904       .clampScalar(1, s32, s64)
905       .scalarSameSizeAs(0, 1);
906   getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();
907 
908   // TODO: Custom lowering for v2s32, v4s32, v2s64.
909   getActionDefinitionsBuilder(G_BITREVERSE)
910       .legalFor({s32, s64, v8s8, v16s8})
911       .widenScalarToNextPow2(0, /*Min = */ 32)
912       .clampScalar(0, s32, s64);
913 
914   getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();
915 
916   getActionDefinitionsBuilder(G_CTTZ)
917       .lowerIf(isVector(0))
918       .widenScalarToNextPow2(1, /*Min=*/32)
919       .clampScalar(1, s32, s64)
920       .scalarSameSizeAs(0, 1)
921       .legalIf([=](const LegalityQuery &Query) {
922         return (HasCSSC && typeInSet(0, {s32, s64})(Query));
923       })
924       .customIf([=](const LegalityQuery &Query) {
925         return (!HasCSSC && typeInSet(0, {s32, s64})(Query));
926       });
927 
928   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
929       .legalIf([=](const LegalityQuery &Query) {
930         const LLT &DstTy = Query.Types[0];
931         const LLT &SrcTy = Query.Types[1];
932         // For now just support the TBL2 variant which needs the source vectors
933         // to be the same size as the dest.
934         if (DstTy != SrcTy)
935           return false;
936         return llvm::is_contained(
937             {v2s64, v2p0, v2s32, v4s32, v4s16, v16s8, v8s8, v8s16}, DstTy);
938       })
939       // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we
940       // just want those lowered into G_BUILD_VECTOR
941       .lowerIf([=](const LegalityQuery &Query) {
942         return !Query.Types[1].isVector();
943       })
944       .moreElementsIf(
945           [](const LegalityQuery &Query) {
946             return Query.Types[0].isVector() && Query.Types[1].isVector() &&
947                    Query.Types[0].getNumElements() >
948                        Query.Types[1].getNumElements();
949           },
950           changeTo(1, 0))
951       .moreElementsToNextPow2(0)
952       .clampNumElements(0, v4s32, v4s32)
953       .clampNumElements(0, v2s64, v2s64)
954       .moreElementsIf(
955           [](const LegalityQuery &Query) {
956             return Query.Types[0].isVector() && Query.Types[1].isVector() &&
957                    Query.Types[0].getNumElements() <
958                        Query.Types[1].getNumElements();
959           },
960           changeTo(0, 1));
961 
962   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
963       .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}});
964 
965   getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({p0});
966 
967   getActionDefinitionsBuilder(G_BRJT).legalFor({{p0, s64}});
968 
969   getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom();
970 
971   getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower();
972 
973   if (ST.hasMOPS()) {
974     // G_BZERO is not supported. Currently it is only emitted by
975     // PreLegalizerCombiner for G_MEMSET with zero constant.
976     getActionDefinitionsBuilder(G_BZERO).unsupported();
977 
978     getActionDefinitionsBuilder(G_MEMSET)
979         .legalForCartesianProduct({p0}, {s64}, {s64})
980         .customForCartesianProduct({p0}, {s8}, {s64})
981         .immIdx(0); // Inform verifier imm idx 0 is handled.
982 
983     getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
984         .legalForCartesianProduct({p0}, {p0}, {s64})
985         .immIdx(0); // Inform verifier imm idx 0 is handled.
986 
987     // G_MEMCPY_INLINE does not have a tailcall immediate
988     getActionDefinitionsBuilder(G_MEMCPY_INLINE)
989         .legalForCartesianProduct({p0}, {p0}, {s64});
990 
991   } else {
992     getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
993         .libcall();
994   }
995 
996   // FIXME: Legal vector types are only legal with NEON.
997   auto &ABSActions = getActionDefinitionsBuilder(G_ABS);
998   if (HasCSSC)
999     ABSActions
1000         .legalFor({s32, s64});
1001   ABSActions
1002       .legalFor(PackedVectorAllTypeList)
1003       .lowerIf(isScalar(0));
1004 
1005   // For fadd reductions we have pairwise operations available. We treat the
1006   // usual legal types as legal and handle the lowering to pairwise instructions
1007   // later.
1008   getActionDefinitionsBuilder(G_VECREDUCE_FADD)
1009       .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
1010       .legalIf([=](const LegalityQuery &Query) {
1011         const auto &Ty = Query.Types[1];
1012         return (Ty == v4s16 || Ty == v8s16) && HasFP16;
1013       })
1014       .minScalarOrElt(0, MinFPScalar)
1015       .clampMaxNumElements(1, s64, 2)
1016       .clampMaxNumElements(1, s32, 4)
1017       .clampMaxNumElements(1, s16, 8)
1018       .lower();
1019 
1020   // For fmul reductions we need to split up into individual operations. We
1021   // clamp to 128 bit vectors then to 64bit vectors to produce a cascade of
1022   // smaller types, followed by scalarizing what remains.
1023   getActionDefinitionsBuilder(G_VECREDUCE_FMUL)
1024       .minScalarOrElt(0, MinFPScalar)
1025       .clampMaxNumElements(1, s64, 2)
1026       .clampMaxNumElements(1, s32, 4)
1027       .clampMaxNumElements(1, s16, 8)
1028       .clampMaxNumElements(1, s32, 2)
1029       .clampMaxNumElements(1, s16, 4)
1030       .scalarize(1)
1031       .lower();
1032 
1033   getActionDefinitionsBuilder({G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL})
1034       .scalarize(2)
1035       .lower();
1036 
1037   getActionDefinitionsBuilder(G_VECREDUCE_ADD)
1038       .legalFor({{s8, v16s8},
1039                  {s8, v8s8},
1040                  {s16, v8s16},
1041                  {s16, v4s16},
1042                  {s32, v4s32},
1043                  {s32, v2s32},
1044                  {s64, v2s64}})
1045       .clampMaxNumElements(1, s64, 2)
1046       .clampMaxNumElements(1, s32, 4)
1047       .clampMaxNumElements(1, s16, 8)
1048       .clampMaxNumElements(1, s8, 16)
1049       .lower();
1050 
1051   getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX,
1052                                G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM})
1053       .legalFor({{s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
1054       .legalIf([=](const LegalityQuery &Query) {
1055         const auto &Ty = Query.Types[1];
1056         return Query.Types[0] == s16 && (Ty == v8s16 || Ty == v4s16) && HasFP16;
1057       })
1058       .minScalarOrElt(0, MinFPScalar)
1059       .clampMaxNumElements(1, s64, 2)
1060       .clampMaxNumElements(1, s32, 4)
1061       .clampMaxNumElements(1, s16, 8)
1062       .lower();
1063 
1064   getActionDefinitionsBuilder(G_VECREDUCE_MUL)
1065       .clampMaxNumElements(1, s32, 2)
1066       .clampMaxNumElements(1, s16, 4)
1067       .clampMaxNumElements(1, s8, 8)
1068       .scalarize(1)
1069       .lower();
1070 
1071   getActionDefinitionsBuilder(
1072       {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX})
1073       .legalFor({{s8, v8s8},
1074                  {s8, v16s8},
1075                  {s16, v4s16},
1076                  {s16, v8s16},
1077                  {s32, v2s32},
1078                  {s32, v4s32}})
1079       .clampMaxNumElements(1, s64, 2)
1080       .clampMaxNumElements(1, s32, 4)
1081       .clampMaxNumElements(1, s16, 8)
1082       .clampMaxNumElements(1, s8, 16)
1083       .scalarize(1)
1084       .lower();
1085 
1086   getActionDefinitionsBuilder(
1087       {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
1088       // Try to break down into smaller vectors as long as they're at least 64
1089       // bits. This lets us use vector operations for some parts of the
1090       // reduction.
1091       .fewerElementsIf(
1092           [=](const LegalityQuery &Q) {
1093             LLT SrcTy = Q.Types[1];
1094             if (SrcTy.isScalar())
1095               return false;
1096             if (!isPowerOf2_32(SrcTy.getNumElements()))
1097               return false;
1098             // We can usually perform 64b vector operations.
1099             return SrcTy.getSizeInBits() > 64;
1100           },
1101           [=](const LegalityQuery &Q) {
1102             LLT SrcTy = Q.Types[1];
1103             return std::make_pair(1, SrcTy.divide(2));
1104           })
1105       .scalarize(1)
1106       .lower();
1107 
1108   getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
1109       .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });
1110 
1111   getActionDefinitionsBuilder({G_FSHL, G_FSHR})
1112       .customFor({{s32, s32}, {s32, s64}, {s64, s64}})
1113       .lower();
1114 
1115   getActionDefinitionsBuilder(G_ROTR)
1116       .legalFor({{s32, s64}, {s64, s64}})
1117       .customIf([=](const LegalityQuery &Q) {
1118         return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
1119       })
1120       .lower();
1121   getActionDefinitionsBuilder(G_ROTL).lower();
1122 
1123   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1124       .customFor({{s32, s32}, {s64, s64}});
1125 
1126   auto always = [=](const LegalityQuery &Q) { return true; };
1127   auto &CTPOPActions = getActionDefinitionsBuilder(G_CTPOP);
1128   if (HasCSSC)
1129     CTPOPActions
1130         .legalFor({{s32, s32},
1131                    {s64, s64},
1132                    {v8s8, v8s8},
1133                    {v16s8, v16s8}})
1134         .customFor({{s128, s128},
1135                     {v2s64, v2s64},
1136                     {v2s32, v2s32},
1137                     {v4s32, v4s32},
1138                     {v4s16, v4s16},
1139                     {v8s16, v8s16}});
1140   else
1141     CTPOPActions
1142         .legalFor({{v8s8, v8s8},
1143                    {v16s8, v16s8}})
1144         .customFor({{s32, s32},
1145                     {s64, s64},
1146                     {s128, s128},
1147                     {v2s64, v2s64},
1148                     {v2s32, v2s32},
1149                     {v4s32, v4s32},
1150                     {v4s16, v4s16},
1151                     {v8s16, v8s16}});
1152   CTPOPActions
1153       .clampScalar(0, s32, s128)
1154       .widenScalarToNextPow2(0)
1155       .minScalarEltSameAsIf(always, 1, 0)
1156       .maxScalarEltSameAsIf(always, 1, 0);
1157 
1158   // TODO: Vector types.
1159   getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0));
1160 
1161   // TODO: Libcall support for s128.
1162   // TODO: s16 should be legal with full FP16 support.
1163   getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1164       .legalFor({{s64, s32}, {s64, s64}});
1165 
1166   // TODO: Custom legalization for vector types.
1167   // TODO: Custom legalization for mismatched types.
1168   // TODO: s16 support.
1169   getActionDefinitionsBuilder(G_FCOPYSIGN).customFor({{s32, s32}, {s64, s64}});
1170 
1171   getActionDefinitionsBuilder(G_FMAD).lower();
1172 
1173   // Access to floating-point environment.
1174   getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV,
1175                                G_GET_FPMODE, G_SET_FPMODE, G_RESET_FPMODE})
1176       .libcall();
1177 
1178   getActionDefinitionsBuilder(G_IS_FPCLASS).lower();
1179 
1180   getActionDefinitionsBuilder(G_PREFETCH).custom();
1181 
1182   getLegacyLegalizerInfo().computeTables();
1183   verify(*ST.getInstrInfo());
1184 }
1185 
1186 bool AArch64LegalizerInfo::legalizeCustom(
1187     LegalizerHelper &Helper, MachineInstr &MI,
1188     LostDebugLocObserver &LocObserver) const {
1189   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1190   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1191   GISelChangeObserver &Observer = Helper.Observer;
1192   switch (MI.getOpcode()) {
1193   default:
1194     // No idea what to do.
1195     return false;
1196   case TargetOpcode::G_VAARG:
1197     return legalizeVaArg(MI, MRI, MIRBuilder);
1198   case TargetOpcode::G_LOAD:
1199   case TargetOpcode::G_STORE:
1200     return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
1201   case TargetOpcode::G_SHL:
1202   case TargetOpcode::G_ASHR:
1203   case TargetOpcode::G_LSHR:
1204     return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
1205   case TargetOpcode::G_GLOBAL_VALUE:
1206     return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
1207   case TargetOpcode::G_SBFX:
1208   case TargetOpcode::G_UBFX:
1209     return legalizeBitfieldExtract(MI, MRI, Helper);
1210   case TargetOpcode::G_FSHL:
1211   case TargetOpcode::G_FSHR:
1212     return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper);
1213   case TargetOpcode::G_ROTR:
1214     return legalizeRotate(MI, MRI, Helper);
1215   case TargetOpcode::G_CTPOP:
1216     return legalizeCTPOP(MI, MRI, Helper);
1217   case TargetOpcode::G_ATOMIC_CMPXCHG:
1218     return legalizeAtomicCmpxchg128(MI, MRI, Helper);
1219   case TargetOpcode::G_CTTZ:
1220     return legalizeCTTZ(MI, Helper);
1221   case TargetOpcode::G_BZERO:
1222   case TargetOpcode::G_MEMCPY:
1223   case TargetOpcode::G_MEMMOVE:
1224   case TargetOpcode::G_MEMSET:
1225     return legalizeMemOps(MI, Helper);
1226   case TargetOpcode::G_FCOPYSIGN:
1227     return legalizeFCopySign(MI, Helper);
1228   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1229     return legalizeExtractVectorElt(MI, MRI, Helper);
1230   case TargetOpcode::G_DYN_STACKALLOC:
1231     return legalizeDynStackAlloc(MI, Helper);
1232   case TargetOpcode::G_PREFETCH:
1233     return legalizePrefetch(MI, Helper);
1234   }
1235 
1236   llvm_unreachable("expected switch to return");
1237 }
1238 
1239 bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
1240                                                MachineRegisterInfo &MRI,
1241                                                MachineIRBuilder &MIRBuilder,
1242                                                GISelChangeObserver &Observer,
1243                                                LegalizerHelper &Helper) const {
1244   assert(MI.getOpcode() == TargetOpcode::G_FSHL ||
1245          MI.getOpcode() == TargetOpcode::G_FSHR);
1246 
1247   // Keep as G_FSHR if shift amount is a G_CONSTANT, else use generic
1248   // lowering
1249   Register ShiftNo = MI.getOperand(3).getReg();
1250   LLT ShiftTy = MRI.getType(ShiftNo);
1251   auto VRegAndVal = getIConstantVRegValWithLookThrough(ShiftNo, MRI);
1252 
1253   // Adjust shift amount according to Opcode (FSHL/FSHR)
1254   // Convert FSHL to FSHR
1255   LLT OperationTy = MRI.getType(MI.getOperand(0).getReg());
1256   APInt BitWidth(ShiftTy.getSizeInBits(), OperationTy.getSizeInBits(), false);
1257 
1258   // Lower non-constant shifts and leave zero shifts to the optimizer.
1259   if (!VRegAndVal || VRegAndVal->Value.urem(BitWidth) == 0)
1260     return (Helper.lowerFunnelShiftAsShifts(MI) ==
1261             LegalizerHelper::LegalizeResult::Legalized);
1262 
1263   APInt Amount = VRegAndVal->Value.urem(BitWidth);
1264 
1265   Amount = MI.getOpcode() == TargetOpcode::G_FSHL ? BitWidth - Amount : Amount;
1266 
1267   // If the instruction is G_FSHR, has a 64-bit G_CONSTANT for shift amount
1268   // in the range of 0 <-> BitWidth, it is legal
1269   if (ShiftTy.getSizeInBits() == 64 && MI.getOpcode() == TargetOpcode::G_FSHR &&
1270       VRegAndVal->Value.ult(BitWidth))
1271     return true;
1272 
1273   // Cast the ShiftNumber to a 64-bit type
1274   auto Cast64 = MIRBuilder.buildConstant(LLT::scalar(64), Amount.zext(64));
1275 
1276   if (MI.getOpcode() == TargetOpcode::G_FSHR) {
1277     Observer.changingInstr(MI);
1278     MI.getOperand(3).setReg(Cast64.getReg(0));
1279     Observer.changedInstr(MI);
1280   }
1281   // If Opcode is FSHL, remove the FSHL instruction and create a FSHR
1282   // instruction
1283   else if (MI.getOpcode() == TargetOpcode::G_FSHL) {
1284     MIRBuilder.buildInstr(TargetOpcode::G_FSHR, {MI.getOperand(0).getReg()},
1285                           {MI.getOperand(1).getReg(), MI.getOperand(2).getReg(),
1286                            Cast64.getReg(0)});
1287     MI.eraseFromParent();
1288   }
1289   return true;
1290 }
1291 
1292 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
1293                                           MachineRegisterInfo &MRI,
1294                                           LegalizerHelper &Helper) const {
1295   // To allow for imported patterns to match, we ensure that the rotate amount
1296   // is 64b with an extension.
1297   Register AmtReg = MI.getOperand(2).getReg();
1298   LLT AmtTy = MRI.getType(AmtReg);
1299   (void)AmtTy;
1300   assert(AmtTy.isScalar() && "Expected a scalar rotate");
1301   assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
1302   auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
1303   Helper.Observer.changingInstr(MI);
1304   MI.getOperand(2).setReg(NewAmt.getReg(0));
1305   Helper.Observer.changedInstr(MI);
1306   return true;
1307 }
1308 
1309 bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
1310     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1311     GISelChangeObserver &Observer) const {
1312   assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
1313   // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
1314   // G_ADD_LOW instructions.
1315   // By splitting this here, we can optimize accesses in the small code model by
1316   // folding in the G_ADD_LOW into the load/store offset.
1317   auto &GlobalOp = MI.getOperand(1);
1318   const auto* GV = GlobalOp.getGlobal();
1319   if (GV->isThreadLocal())
1320     return true; // Don't want to modify TLS vars.
1321 
1322   auto &TM = ST->getTargetLowering()->getTargetMachine();
1323   unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);
1324 
1325   if (OpFlags & AArch64II::MO_GOT)
1326     return true;
1327 
1328   auto Offset = GlobalOp.getOffset();
1329   Register DstReg = MI.getOperand(0).getReg();
1330   auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
1331                   .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
1332   // Set the regclass on the dest reg too.
1333   MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1334 
1335   // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
1336   // by creating a MOVK that sets bits 48-63 of the register to (global address
1337   // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
1338   // prevent an incorrect tag being generated during relocation when the
1339   // global appears before the code section. Without the offset, a global at
1340   // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
1341   // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
1342   // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
1343   // instead of `0xf`.
1344   // This assumes that we're in the small code model so we can assume a binary
1345   // size of <= 4GB, which makes the untagged PC relative offset positive. The
1346   // binary must also be loaded into address range [0, 2^48). Both of these
1347   // properties need to be ensured at runtime when using tagged addresses.
1348   if (OpFlags & AArch64II::MO_TAGGED) {
1349     assert(!Offset &&
1350            "Should not have folded in an offset for a tagged global!");
1351     ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
1352                .addGlobalAddress(GV, 0x100000000,
1353                                  AArch64II::MO_PREL | AArch64II::MO_G3)
1354                .addImm(48);
1355     MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1356   }
1357 
1358   MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
1359       .addGlobalAddress(GV, Offset,
1360                         OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
1361   MI.eraseFromParent();
1362   return true;
1363 }
1364 
1365 bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
1366                                              MachineInstr &MI) const {
1367   Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
1368   switch (IntrinsicID) {
1369   case Intrinsic::vacopy: {
1370     unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
1371     unsigned VaListSize =
1372       (ST->isTargetDarwin() || ST->isTargetWindows())
1373           ? PtrSize
1374           : ST->isTargetILP32() ? 20 : 32;
1375 
1376     MachineFunction &MF = *MI.getMF();
1377     auto Val = MF.getRegInfo().createGenericVirtualRegister(
1378         LLT::scalar(VaListSize * 8));
1379     MachineIRBuilder MIB(MI);
1380     MIB.buildLoad(Val, MI.getOperand(2),
1381                   *MF.getMachineMemOperand(MachinePointerInfo(),
1382                                            MachineMemOperand::MOLoad,
1383                                            VaListSize, Align(PtrSize)));
1384     MIB.buildStore(Val, MI.getOperand(1),
1385                    *MF.getMachineMemOperand(MachinePointerInfo(),
1386                                             MachineMemOperand::MOStore,
1387                                             VaListSize, Align(PtrSize)));
1388     MI.eraseFromParent();
1389     return true;
1390   }
1391   case Intrinsic::get_dynamic_area_offset: {
1392     MachineIRBuilder &MIB = Helper.MIRBuilder;
1393     MIB.buildConstant(MI.getOperand(0).getReg(), 0);
1394     MI.eraseFromParent();
1395     return true;
1396   }
1397   case Intrinsic::aarch64_mops_memset_tag: {
1398     assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1399     // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
1400     // the instruction).
1401     MachineIRBuilder MIB(MI);
1402     auto &Value = MI.getOperand(3);
1403     Register ExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
1404     Value.setReg(ExtValueReg);
1405     return true;
1406   }
1407   case Intrinsic::aarch64_prefetch: {
1408     MachineIRBuilder MIB(MI);
1409     auto &AddrVal = MI.getOperand(1);
1410 
1411     int64_t IsWrite = MI.getOperand(2).getImm();
1412     int64_t Target = MI.getOperand(3).getImm();
1413     int64_t IsStream = MI.getOperand(4).getImm();
1414     int64_t IsData = MI.getOperand(5).getImm();
1415 
1416     unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
1417                      (!IsData << 3) |    // IsDataCache bit
1418                      (Target << 1) |     // Cache level bits
1419                      (unsigned)IsStream; // Stream bit
1420 
1421     MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
1422     MI.eraseFromParent();
1423     return true;
1424   }
1425   case Intrinsic::aarch64_neon_uaddv:
1426   case Intrinsic::aarch64_neon_saddv:
1427   case Intrinsic::aarch64_neon_umaxv:
1428   case Intrinsic::aarch64_neon_smaxv:
1429   case Intrinsic::aarch64_neon_uminv:
1430   case Intrinsic::aarch64_neon_sminv: {
1431     MachineIRBuilder MIB(MI);
1432     MachineRegisterInfo &MRI = *MIB.getMRI();
1433     bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv ||
1434                     IntrinsicID == Intrinsic::aarch64_neon_smaxv ||
1435                     IntrinsicID == Intrinsic::aarch64_neon_sminv;
1436 
1437     auto OldDst = MI.getOperand(0).getReg();
1438     auto OldDstTy = MRI.getType(OldDst);
1439     LLT NewDstTy = MRI.getType(MI.getOperand(2).getReg()).getElementType();
1440     if (OldDstTy == NewDstTy)
1441       return true;
1442 
1443     auto NewDst = MRI.createGenericVirtualRegister(NewDstTy);
1444 
1445     Helper.Observer.changingInstr(MI);
1446     MI.getOperand(0).setReg(NewDst);
1447     Helper.Observer.changedInstr(MI);
1448 
1449     MIB.setInsertPt(MIB.getMBB(), ++MIB.getInsertPt());
1450     MIB.buildExtOrTrunc(IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT,
1451                         OldDst, NewDst);
1452 
1453     return true;
1454   }
1455   case Intrinsic::aarch64_neon_uaddlp:
1456   case Intrinsic::aarch64_neon_saddlp: {
1457     MachineIRBuilder MIB(MI);
1458 
1459     unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
1460                        ? AArch64::G_UADDLP
1461                        : AArch64::G_SADDLP;
1462     MIB.buildInstr(Opc, {MI.getOperand(0)}, {MI.getOperand(2)});
1463     MI.eraseFromParent();
1464 
1465     return true;
1466   }
1467   case Intrinsic::aarch64_neon_uaddlv:
1468   case Intrinsic::aarch64_neon_saddlv: {
1469     MachineIRBuilder MIB(MI);
1470     MachineRegisterInfo &MRI = *MIB.getMRI();
1471 
1472     unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
1473                        ? AArch64::G_UADDLV
1474                        : AArch64::G_SADDLV;
1475     Register DstReg = MI.getOperand(0).getReg();
1476     Register SrcReg = MI.getOperand(2).getReg();
1477     LLT DstTy = MRI.getType(DstReg);
1478 
1479     LLT MidTy, ExtTy;
1480     if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) {
1481       MidTy = LLT::fixed_vector(4, 32);
1482       ExtTy = LLT::scalar(32);
1483     } else {
1484       MidTy = LLT::fixed_vector(2, 64);
1485       ExtTy = LLT::scalar(64);
1486     }
1487 
1488     Register MidReg =
1489         MIB.buildInstr(Opc, {MidTy}, {SrcReg})->getOperand(0).getReg();
1490     Register ZeroReg =
1491         MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
1492     Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy},
1493                                      {MidReg, ZeroReg})
1494                           .getReg(0);
1495 
1496     if (DstTy.getScalarSizeInBits() < 32)
1497       MIB.buildTrunc(DstReg, ExtReg);
1498     else
1499       MIB.buildCopy(DstReg, ExtReg);
1500 
1501     MI.eraseFromParent();
1502 
1503     return true;
1504   }
1505   case Intrinsic::aarch64_neon_smax:
1506   case Intrinsic::aarch64_neon_smin:
1507   case Intrinsic::aarch64_neon_umax:
1508   case Intrinsic::aarch64_neon_umin:
1509   case Intrinsic::aarch64_neon_fmax:
1510   case Intrinsic::aarch64_neon_fmin:
1511   case Intrinsic::aarch64_neon_fmaxnm:
1512   case Intrinsic::aarch64_neon_fminnm: {
1513     MachineIRBuilder MIB(MI);
1514     if (IntrinsicID == Intrinsic::aarch64_neon_smax)
1515       MIB.buildSMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
1516     else if (IntrinsicID == Intrinsic::aarch64_neon_smin)
1517       MIB.buildSMin(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
1518     else if (IntrinsicID == Intrinsic::aarch64_neon_umax)
1519       MIB.buildUMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
1520     else if (IntrinsicID == Intrinsic::aarch64_neon_umin)
1521       MIB.buildUMin(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
1522     else if (IntrinsicID == Intrinsic::aarch64_neon_fmax)
1523       MIB.buildInstr(TargetOpcode::G_FMAXIMUM, {MI.getOperand(0)},
1524                      {MI.getOperand(2), MI.getOperand(3)});
1525     else if (IntrinsicID == Intrinsic::aarch64_neon_fmin)
1526       MIB.buildInstr(TargetOpcode::G_FMINIMUM, {MI.getOperand(0)},
1527                      {MI.getOperand(2), MI.getOperand(3)});
1528     else if (IntrinsicID == Intrinsic::aarch64_neon_fmaxnm)
1529       MIB.buildInstr(TargetOpcode::G_FMAXNUM, {MI.getOperand(0)},
1530                      {MI.getOperand(2), MI.getOperand(3)});
1531     else if (IntrinsicID == Intrinsic::aarch64_neon_fminnm)
1532       MIB.buildInstr(TargetOpcode::G_FMINNUM, {MI.getOperand(0)},
1533                      {MI.getOperand(2), MI.getOperand(3)});
1534     MI.eraseFromParent();
1535     return true;
1536   }
1537   case Intrinsic::experimental_vector_reverse:
1538     // TODO: Add support for vector_reverse
1539     return false;
1540   }
1541 
1542   return true;
1543 }
1544 
1545 bool AArch64LegalizerInfo::legalizeShlAshrLshr(
1546     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1547     GISelChangeObserver &Observer) const {
1548   assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
1549          MI.getOpcode() == TargetOpcode::G_LSHR ||
1550          MI.getOpcode() == TargetOpcode::G_SHL);
1551   // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
1552   // imported patterns can select it later. Either way, it will be legal.
1553   Register AmtReg = MI.getOperand(2).getReg();
1554   auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
1555   if (!VRegAndVal)
1556     return true;
1557   // Check the shift amount is in range for an immediate form.
1558   int64_t Amount = VRegAndVal->Value.getSExtValue();
1559   if (Amount > 31)
1560     return true; // This will have to remain a register variant.
1561   auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
1562   Observer.changingInstr(MI);
1563   MI.getOperand(2).setReg(ExtCst.getReg(0));
1564   Observer.changedInstr(MI);
1565   return true;
1566 }
1567 
1568 static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
1569                                 MachineRegisterInfo &MRI) {
1570   Base = Root;
1571   Offset = 0;
1572 
1573   Register NewBase;
1574   int64_t NewOffset;
1575   if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
1576       isShiftedInt<7, 3>(NewOffset)) {
1577     Base = NewBase;
1578     Offset = NewOffset;
1579   }
1580 }
1581 
1582 // FIXME: This should be removed and replaced with the generic bitcast legalize
1583 // action.
1584 bool AArch64LegalizerInfo::legalizeLoadStore(
1585     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1586     GISelChangeObserver &Observer) const {
1587   assert(MI.getOpcode() == TargetOpcode::G_STORE ||
1588          MI.getOpcode() == TargetOpcode::G_LOAD);
1589   // Here we just try to handle vector loads/stores where our value type might
1590   // have pointer elements, which the SelectionDAG importer can't handle. To
1591   // allow the existing patterns for s64 to fire for p0, we just try to bitcast
1592   // the value to use s64 types.
1593 
1594   // Custom legalization requires the instruction, if not deleted, must be fully
1595   // legalized. In order to allow further legalization of the inst, we create
1596   // a new instruction and erase the existing one.
1597 
1598   Register ValReg = MI.getOperand(0).getReg();
1599   const LLT ValTy = MRI.getType(ValReg);
1600 
1601   if (ValTy == LLT::scalar(128)) {
1602 
1603     AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
1604     bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
1605     bool IsLoadAcquire = IsLoad && Ordering == AtomicOrdering::Acquire;
1606     bool IsStoreRelease = !IsLoad && Ordering == AtomicOrdering::Release;
1607     bool IsRcpC3 =
1608         ST->hasLSE2() && ST->hasRCPC3() && (IsLoadAcquire || IsStoreRelease);
1609 
1610     LLT s64 = LLT::scalar(64);
1611 
1612     unsigned Opcode;
1613     if (IsRcpC3) {
1614       Opcode = IsLoad ? AArch64::LDIAPPX : AArch64::STILPX;
1615     } else {
1616       // For LSE2, loads/stores should have been converted to monotonic and had
1617       // a fence inserted after them.
1618       assert(Ordering == AtomicOrdering::Monotonic ||
1619              Ordering == AtomicOrdering::Unordered);
1620       assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
1621 
1622       Opcode = IsLoad ? AArch64::LDPXi : AArch64::STPXi;
1623     }
1624 
1625     MachineInstrBuilder NewI;
1626     if (IsLoad) {
1627       NewI = MIRBuilder.buildInstr(Opcode, {s64, s64}, {});
1628       MIRBuilder.buildMergeLikeInstr(
1629           ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
1630     } else {
1631       auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
1632       NewI = MIRBuilder.buildInstr(
1633           Opcode, {}, {Split->getOperand(0), Split->getOperand(1)});
1634     }
1635 
1636     if (IsRcpC3) {
1637       NewI.addUse(MI.getOperand(1).getReg());
1638     } else {
1639       Register Base;
1640       int Offset;
1641       matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
1642       NewI.addUse(Base);
1643       NewI.addImm(Offset / 8);
1644     }
1645 
1646     NewI.cloneMemRefs(MI);
1647     constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
1648                                      *MRI.getTargetRegisterInfo(),
1649                                      *ST->getRegBankInfo());
1650     MI.eraseFromParent();
1651     return true;
1652   }
1653 
1654   if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
1655       ValTy.getElementType().getAddressSpace() != 0) {
1656     LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
1657     return false;
1658   }
1659 
1660   unsigned PtrSize = ValTy.getElementType().getSizeInBits();
1661   const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
1662   auto &MMO = **MI.memoperands_begin();
1663   MMO.setType(NewTy);
1664 
1665   if (MI.getOpcode() == TargetOpcode::G_STORE) {
1666     auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
1667     MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
1668   } else {
1669     auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
1670     MIRBuilder.buildBitcast(ValReg, NewLoad);
1671   }
1672   MI.eraseFromParent();
1673   return true;
1674 }
1675 
1676 bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
1677                                          MachineRegisterInfo &MRI,
1678                                          MachineIRBuilder &MIRBuilder) const {
1679   MachineFunction &MF = MIRBuilder.getMF();
1680   Align Alignment(MI.getOperand(2).getImm());
1681   Register Dst = MI.getOperand(0).getReg();
1682   Register ListPtr = MI.getOperand(1).getReg();
1683 
1684   LLT PtrTy = MRI.getType(ListPtr);
1685   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1686 
1687   const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
1688   const Align PtrAlign = Align(PtrSize);
1689   auto List = MIRBuilder.buildLoad(
1690       PtrTy, ListPtr,
1691       *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1692                                PtrTy, PtrAlign));
1693 
1694   MachineInstrBuilder DstPtr;
1695   if (Alignment > PtrAlign) {
1696     // Realign the list to the actual required alignment.
1697     auto AlignMinus1 =
1698         MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
1699     auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
1700     DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
1701   } else
1702     DstPtr = List;
1703 
1704   LLT ValTy = MRI.getType(Dst);
1705   uint64_t ValSize = ValTy.getSizeInBits() / 8;
1706   MIRBuilder.buildLoad(
1707       Dst, DstPtr,
1708       *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1709                                ValTy, std::max(Alignment, PtrAlign)));
1710 
1711   auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));
1712 
1713   auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));
1714 
1715   MIRBuilder.buildStore(NewList, ListPtr,
1716                         *MF.getMachineMemOperand(MachinePointerInfo(),
1717                                                  MachineMemOperand::MOStore,
1718                                                  PtrTy, PtrAlign));
1719 
1720   MI.eraseFromParent();
1721   return true;
1722 }
1723 
1724 bool AArch64LegalizerInfo::legalizeBitfieldExtract(
1725     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
1726   // Only legal if we can select immediate forms.
1727   // TODO: Lower this otherwise.
1728   return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
1729          getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
1730 }
1731 
1732 bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
1733                                          MachineRegisterInfo &MRI,
1734                                          LegalizerHelper &Helper) const {
1735   // When there is no integer popcount instruction (FEAT_CSSC isn't available),
1736   // it can be more efficiently lowered to the following sequence that uses
1737   // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
1738   // registers are cheap.
1739   //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
1740   //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
1741   //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
1742   //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
1743   //
1744   // For 128 bit vector popcounts, we lower to the following sequence:
1745   //  cnt.16b   v0, v0  // v8s16, v4s32, v2s64
1746   //  uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
1747   //  uaddlp.4s v0, v0  //        v4s32, v2s64
1748   //  uaddlp.2d v0, v0  //               v2s64
1749   //
1750   // For 64 bit vector popcounts, we lower to the following sequence:
1751   //  cnt.8b    v0, v0  // v4s16, v2s32
1752   //  uaddlp.4h v0, v0  // v4s16, v2s32
1753   //  uaddlp.2s v0, v0  //        v2s32
1754 
1755   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1756   Register Dst = MI.getOperand(0).getReg();
1757   Register Val = MI.getOperand(1).getReg();
1758   LLT Ty = MRI.getType(Val);
1759   unsigned Size = Ty.getSizeInBits();
1760 
1761   assert(Ty == MRI.getType(Dst) &&
1762          "Expected src and dst to have the same type!");
1763 
1764   if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
1765     LLT s64 = LLT::scalar(64);
1766 
1767     auto Split = MIRBuilder.buildUnmerge(s64, Val);
1768     auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0));
1769     auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1));
1770     auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2);
1771 
1772     MIRBuilder.buildZExt(Dst, Add);
1773     MI.eraseFromParent();
1774     return true;
1775   }
1776 
1777   if (!ST->hasNEON() ||
1778       MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) {
1779     // Use generic lowering when custom lowering is not possible.
1780     return Ty.isScalar() && (Size == 32 || Size == 64) &&
1781            Helper.lowerBitCount(MI) ==
1782                LegalizerHelper::LegalizeResult::Legalized;
1783   }
1784 
1785   // Pre-conditioning: widen Val up to the nearest vector type.
1786   // s32,s64,v4s16,v2s32 -> v8i8
1787   // v8s16,v4s32,v2s64 -> v16i8
1788   LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
1789   if (Ty.isScalar()) {
1790     assert((Size == 32 || Size == 64 || Size == 128) && "Expected only 32, 64, or 128 bit scalars!");
1791     if (Size == 32) {
1792       Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
1793     }
1794   }
1795   Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);
1796 
1797   // Count bits in each byte-sized lane.
1798   auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);
1799 
1800   // Sum across lanes.
1801   Register HSum = CTPOP.getReg(0);
1802   unsigned Opc;
1803   SmallVector<LLT> HAddTys;
1804   if (Ty.isScalar()) {
1805     Opc = Intrinsic::aarch64_neon_uaddlv;
1806     HAddTys.push_back(LLT::scalar(32));
1807   } else if (Ty == LLT::fixed_vector(8, 16)) {
1808     Opc = Intrinsic::aarch64_neon_uaddlp;
1809     HAddTys.push_back(LLT::fixed_vector(8, 16));
1810   } else if (Ty == LLT::fixed_vector(4, 32)) {
1811     Opc = Intrinsic::aarch64_neon_uaddlp;
1812     HAddTys.push_back(LLT::fixed_vector(8, 16));
1813     HAddTys.push_back(LLT::fixed_vector(4, 32));
1814   } else if (Ty == LLT::fixed_vector(2, 64)) {
1815     Opc = Intrinsic::aarch64_neon_uaddlp;
1816     HAddTys.push_back(LLT::fixed_vector(8, 16));
1817     HAddTys.push_back(LLT::fixed_vector(4, 32));
1818     HAddTys.push_back(LLT::fixed_vector(2, 64));
1819   } else if (Ty == LLT::fixed_vector(4, 16)) {
1820     Opc = Intrinsic::aarch64_neon_uaddlp;
1821     HAddTys.push_back(LLT::fixed_vector(4, 16));
1822   } else if (Ty == LLT::fixed_vector(2, 32)) {
1823     Opc = Intrinsic::aarch64_neon_uaddlp;
1824     HAddTys.push_back(LLT::fixed_vector(4, 16));
1825     HAddTys.push_back(LLT::fixed_vector(2, 32));
1826   } else
1827     llvm_unreachable("unexpected vector shape");
1828   MachineInstrBuilder UADD;
1829   for (LLT HTy : HAddTys) {
1830     UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}).addUse(HSum);
1831     HSum = UADD.getReg(0);
1832   }
1833 
1834   // Post-conditioning.
1835   if (Ty.isScalar() && (Size == 64 || Size == 128))
1836     MIRBuilder.buildZExt(Dst, UADD);
1837   else
1838     UADD->getOperand(0).setReg(Dst);
1839   MI.eraseFromParent();
1840   return true;
1841 }
1842 
1843 bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
1844     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
1845   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1846   LLT s64 = LLT::scalar(64);
1847   auto Addr = MI.getOperand(1).getReg();
1848   auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
1849   auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
1850   auto DstLo = MRI.createGenericVirtualRegister(s64);
1851   auto DstHi = MRI.createGenericVirtualRegister(s64);
1852 
1853   MachineInstrBuilder CAS;
1854   if (ST->hasLSE()) {
1855     // We have 128-bit CASP instructions taking XSeqPair registers, which are
1856     // s128. We need the merge/unmerge to bracket the expansion and pair up with
1857     // the rest of the MIR so we must reassemble the extracted registers into a
1858     // 128-bit known-regclass one with code like this:
1859     //
1860     //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
1861     //     %out = CASP %in1, ...
1862     //     %OldLo = G_EXTRACT %out, 0
1863     //     %OldHi = G_EXTRACT %out, 64
1864     auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
1865     unsigned Opcode;
1866     switch (Ordering) {
1867     case AtomicOrdering::Acquire:
1868       Opcode = AArch64::CASPAX;
1869       break;
1870     case AtomicOrdering::Release:
1871       Opcode = AArch64::CASPLX;
1872       break;
1873     case AtomicOrdering::AcquireRelease:
1874     case AtomicOrdering::SequentiallyConsistent:
1875       Opcode = AArch64::CASPALX;
1876       break;
1877     default:
1878       Opcode = AArch64::CASPX;
1879       break;
1880     }
1881 
1882     LLT s128 = LLT::scalar(128);
1883     auto CASDst = MRI.createGenericVirtualRegister(s128);
1884     auto CASDesired = MRI.createGenericVirtualRegister(s128);
1885     auto CASNew = MRI.createGenericVirtualRegister(s128);
1886     MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
1887         .addUse(DesiredI->getOperand(0).getReg())
1888         .addImm(AArch64::sube64)
1889         .addUse(DesiredI->getOperand(1).getReg())
1890         .addImm(AArch64::subo64);
1891     MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
1892         .addUse(NewI->getOperand(0).getReg())
1893         .addImm(AArch64::sube64)
1894         .addUse(NewI->getOperand(1).getReg())
1895         .addImm(AArch64::subo64);
1896 
1897     CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});
1898 
1899     MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
1900     MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
1901   } else {
1902     // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
1903     // can take arbitrary registers so it just has the normal GPR64 operands the
1904     // rest of AArch64 is expecting.
1905     auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
1906     unsigned Opcode;
1907     switch (Ordering) {
1908     case AtomicOrdering::Acquire:
1909       Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
1910       break;
1911     case AtomicOrdering::Release:
1912       Opcode = AArch64::CMP_SWAP_128_RELEASE;
1913       break;
1914     case AtomicOrdering::AcquireRelease:
1915     case AtomicOrdering::SequentiallyConsistent:
1916       Opcode = AArch64::CMP_SWAP_128;
1917       break;
1918     default:
1919       Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
1920       break;
1921     }
1922 
1923     auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1924     CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
1925                                 {Addr, DesiredI->getOperand(0),
1926                                  DesiredI->getOperand(1), NewI->getOperand(0),
1927                                  NewI->getOperand(1)});
1928   }
1929 
1930   CAS.cloneMemRefs(MI);
1931   constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
1932                                    *MRI.getTargetRegisterInfo(),
1933                                    *ST->getRegBankInfo());
1934 
1935   MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi});
1936   MI.eraseFromParent();
1937   return true;
1938 }
1939 
1940 bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
1941                                         LegalizerHelper &Helper) const {
1942   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1943   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1944   LLT Ty = MRI.getType(MI.getOperand(1).getReg());
1945   auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
1946   MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
1947   MI.eraseFromParent();
1948   return true;
1949 }
1950 
1951 bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
1952                                           LegalizerHelper &Helper) const {
1953   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1954 
1955   // Tagged version MOPSMemorySetTagged is legalised in legalizeIntrinsic
1956   if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
1957     // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
1958     // the instruction).
1959     auto &Value = MI.getOperand(1);
1960     Register ExtValueReg =
1961         MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
1962     Value.setReg(ExtValueReg);
1963     return true;
1964   }
1965 
1966   return false;
1967 }
1968 
1969 bool AArch64LegalizerInfo::legalizeFCopySign(MachineInstr &MI,
1970                                              LegalizerHelper &Helper) const {
1971   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1972   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1973   Register Dst = MI.getOperand(0).getReg();
1974   LLT DstTy = MRI.getType(Dst);
1975   assert(DstTy.isScalar() && "Only expected scalars right now!");
1976   const unsigned DstSize = DstTy.getSizeInBits();
1977   assert((DstSize == 32 || DstSize == 64) && "Unexpected dst type!");
1978   assert(MRI.getType(MI.getOperand(2).getReg()) == DstTy &&
1979          "Expected homogeneous types!");
1980 
1981   // We want to materialize a mask with the high bit set.
1982   uint64_t EltMask;
1983   LLT VecTy;
1984 
1985   // TODO: s16 support.
1986   switch (DstSize) {
1987   default:
1988     llvm_unreachable("Unexpected type for G_FCOPYSIGN!");
1989   case 64: {
1990     // AdvSIMD immediate moves cannot materialize out mask in a single
1991     // instruction for 64-bit elements. Instead, materialize zero and then
1992     // negate it.
1993     EltMask = 0;
1994     VecTy = LLT::fixed_vector(2, DstTy);
1995     break;
1996   }
1997   case 32:
1998     EltMask = 0x80000000ULL;
1999     VecTy = LLT::fixed_vector(4, DstTy);
2000     break;
2001   }
2002 
2003   // Widen In1 and In2 to 128 bits. We want these to eventually become
2004   // INSERT_SUBREGs.
2005   auto Undef = MIRBuilder.buildUndef(VecTy);
2006   auto Zero = MIRBuilder.buildConstant(DstTy, 0);
2007   auto Ins1 = MIRBuilder.buildInsertVectorElement(
2008       VecTy, Undef, MI.getOperand(1).getReg(), Zero);
2009   auto Ins2 = MIRBuilder.buildInsertVectorElement(
2010       VecTy, Undef, MI.getOperand(2).getReg(), Zero);
2011 
2012   // Construct the mask.
2013   auto Mask = MIRBuilder.buildConstant(VecTy, EltMask);
2014   if (DstSize == 64)
2015     Mask = MIRBuilder.buildFNeg(VecTy, Mask);
2016 
2017   auto Sel = MIRBuilder.buildInstr(AArch64::G_BSP, {VecTy}, {Mask, Ins2, Ins1});
2018 
2019   // Build an unmerge whose 0th elt is the original G_FCOPYSIGN destination. We
2020   // want this to eventually become an EXTRACT_SUBREG.
2021   SmallVector<Register, 2> DstRegs(1, Dst);
2022   for (unsigned I = 1, E = VecTy.getNumElements(); I < E; ++I)
2023     DstRegs.push_back(MRI.createGenericVirtualRegister(DstTy));
2024   MIRBuilder.buildUnmerge(DstRegs, Sel);
2025   MI.eraseFromParent();
2026   return true;
2027 }
2028 
2029 bool AArch64LegalizerInfo::legalizeExtractVectorElt(
2030     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2031   assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT);
2032   auto VRegAndVal =
2033       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2034   if (VRegAndVal)
2035     return true;
2036   return Helper.lowerExtractInsertVectorElt(MI) !=
2037          LegalizerHelper::LegalizeResult::UnableToLegalize;
2038 }
2039 
2040 bool AArch64LegalizerInfo::legalizeDynStackAlloc(
2041     MachineInstr &MI, LegalizerHelper &Helper) const {
2042   MachineFunction &MF = *MI.getParent()->getParent();
2043   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2044   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
2045 
2046   // If stack probing is not enabled for this function, use the default
2047   // lowering.
2048   if (!MF.getFunction().hasFnAttribute("probe-stack") ||
2049       MF.getFunction().getFnAttribute("probe-stack").getValueAsString() !=
2050           "inline-asm") {
2051     Helper.lowerDynStackAlloc(MI);
2052     return true;
2053   }
2054 
2055   Register Dst = MI.getOperand(0).getReg();
2056   Register AllocSize = MI.getOperand(1).getReg();
2057   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
2058 
2059   assert(MRI.getType(Dst) == LLT::pointer(0, 64) &&
2060          "Unexpected type for dynamic alloca");
2061   assert(MRI.getType(AllocSize) == LLT::scalar(64) &&
2062          "Unexpected type for dynamic alloca");
2063 
2064   LLT PtrTy = MRI.getType(Dst);
2065   Register SPReg =
2066       Helper.getTargetLowering().getStackPointerRegisterToSaveRestore();
2067   Register SPTmp =
2068       Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
2069   auto NewMI =
2070       MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp});
2071   MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass);
2072   MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI);
2073   MIRBuilder.buildCopy(Dst, SPTmp);
2074 
2075   MI.eraseFromParent();
2076   return true;
2077 }
2078 
2079 bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
2080                                             LegalizerHelper &Helper) const {
2081   MachineIRBuilder &MIB = Helper.MIRBuilder;
2082   auto &AddrVal = MI.getOperand(0);
2083 
2084   int64_t IsWrite = MI.getOperand(1).getImm();
2085   int64_t Locality = MI.getOperand(2).getImm();
2086   int64_t IsData = MI.getOperand(3).getImm();
2087 
2088   bool IsStream = Locality == 0;
2089   if (Locality != 0) {
2090     assert(Locality <= 3 && "Prefetch locality out-of-range");
2091     // The locality degree is the opposite of the cache speed.
2092     // Put the number the other way around.
2093     // The encoding starts at 0 for level 1
2094     Locality = 3 - Locality;
2095   }
2096 
2097   unsigned PrfOp = (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;
2098 
2099   MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
2100   MI.eraseFromParent();
2101   return true;
2102 }
2103