xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 //===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64LegalizerInfo.h"
15 #include "AArch64RegisterBankInfo.h"
16 #include "AArch64Subtarget.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
19 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
20 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
21 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/Utils.h"
24 #include "llvm/CodeGen/MachineInstr.h"
25 #include "llvm/CodeGen/MachineRegisterInfo.h"
26 #include "llvm/CodeGen/TargetOpcodes.h"
27 #include "llvm/CodeGen/ValueTypes.h"
28 #include "llvm/IR/DerivedTypes.h"
29 #include "llvm/IR/Intrinsics.h"
30 #include "llvm/IR/IntrinsicsAArch64.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/MathExtras.h"
33 #include <initializer_list>
34 
35 #define DEBUG_TYPE "aarch64-legalinfo"
36 
37 using namespace llvm;
38 using namespace LegalizeActions;
39 using namespace LegalizeMutations;
40 using namespace LegalityPredicates;
41 using namespace MIPatternMatch;
42 
43 AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
44     : ST(&ST) {
45   using namespace TargetOpcode;
46   const LLT p0 = LLT::pointer(0, 64);
47   const LLT s8 = LLT::scalar(8);
48   const LLT s16 = LLT::scalar(16);
49   const LLT s32 = LLT::scalar(32);
50   const LLT s64 = LLT::scalar(64);
51   const LLT s128 = LLT::scalar(128);
52   const LLT v16s8 = LLT::fixed_vector(16, 8);
53   const LLT v8s8 = LLT::fixed_vector(8, 8);
54   const LLT v4s8 = LLT::fixed_vector(4, 8);
55   const LLT v2s8 = LLT::fixed_vector(2, 8);
56   const LLT v8s16 = LLT::fixed_vector(8, 16);
57   const LLT v4s16 = LLT::fixed_vector(4, 16);
58   const LLT v2s16 = LLT::fixed_vector(2, 16);
59   const LLT v2s32 = LLT::fixed_vector(2, 32);
60   const LLT v4s32 = LLT::fixed_vector(4, 32);
61   const LLT v2s64 = LLT::fixed_vector(2, 64);
62   const LLT v2p0 = LLT::fixed_vector(2, p0);
63 
64   const LLT nxv16s8 = LLT::scalable_vector(16, s8);
65   const LLT nxv8s16 = LLT::scalable_vector(8, s16);
66   const LLT nxv4s32 = LLT::scalable_vector(4, s32);
67   const LLT nxv2s64 = LLT::scalable_vector(2, s64);
68 
69   std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
70                                                         v16s8, v8s16, v4s32,
71                                                         v2s64, v2p0,
72                                                         /* End 128bit types */
73                                                         /* Begin 64bit types */
74                                                         v8s8, v4s16, v2s32};
75   std::initializer_list<LLT> ScalarAndPtrTypesList = {s8, s16, s32, s64, p0};
76   SmallVector<LLT, 8> PackedVectorAllTypesVec(PackedVectorAllTypeList);
77   SmallVector<LLT, 8> ScalarAndPtrTypesVec(ScalarAndPtrTypesList);
78 
79   const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
80 
81   // FIXME: support subtargets which have neon/fp-armv8 disabled.
82   if (!ST.hasNEON() || !ST.hasFPARMv8()) {
83     getLegacyLegalizerInfo().computeTables();
84     return;
85   }
86 
87   // Some instructions only support s16 if the subtarget has full 16-bit FP
88   // support.
89   const bool HasFP16 = ST.hasFullFP16();
90   const LLT &MinFPScalar = HasFP16 ? s16 : s32;
91 
92   const bool HasCSSC = ST.hasCSSC();
93   const bool HasRCPC3 = ST.hasRCPC3();
94 
95   getActionDefinitionsBuilder(
96       {G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER})
97       .legalFor({p0, s8, s16, s32, s64})
98       .legalFor(PackedVectorAllTypeList)
99       .widenScalarToNextPow2(0)
100       .clampScalar(0, s8, s64)
101       .moreElementsToNextPow2(0)
102       .widenVectorEltsToVectorMinSize(0, 64)
103       .clampNumElements(0, v8s8, v16s8)
104       .clampNumElements(0, v4s16, v8s16)
105       .clampNumElements(0, v2s32, v4s32)
106       .clampNumElements(0, v2s64, v2s64);
107 
108   getActionDefinitionsBuilder(G_PHI)
109       .legalFor({p0, s16, s32, s64})
110       .legalFor(PackedVectorAllTypeList)
111       .widenScalarToNextPow2(0)
112       .clampScalar(0, s16, s64)
113       // Maximum: sN * k = 128
114       .clampMaxNumElements(0, s8, 16)
115       .clampMaxNumElements(0, s16, 8)
116       .clampMaxNumElements(0, s32, 4)
117       .clampMaxNumElements(0, s64, 2)
118       .clampMaxNumElements(0, p0, 2);
119 
120   getActionDefinitionsBuilder(G_BSWAP)
121       .legalFor({s32, s64, v4s16, v8s16, v2s32, v4s32, v2s64})
122       .widenScalarOrEltToNextPow2(0, 16)
123       .clampScalar(0, s32, s64)
124       .clampNumElements(0, v4s16, v8s16)
125       .clampNumElements(0, v2s32, v4s32)
126       .clampNumElements(0, v2s64, v2s64)
127       .moreElementsToNextPow2(0);
128 
129   getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
130       .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
131       .widenScalarToNextPow2(0)
132       .clampScalar(0, s32, s64)
133       .clampMaxNumElements(0, s8, 16)
134       .clampMaxNumElements(0, s16, 8)
135       .clampNumElements(0, v2s32, v4s32)
136       .clampNumElements(0, v2s64, v2s64)
137       .minScalarOrEltIf(
138           [=](const LegalityQuery &Query) {
139             return Query.Types[0].getNumElements() <= 2;
140           },
141           0, s32)
142       .minScalarOrEltIf(
143           [=](const LegalityQuery &Query) {
144             return Query.Types[0].getNumElements() <= 4;
145           },
146           0, s16)
147       .minScalarOrEltIf(
148           [=](const LegalityQuery &Query) {
149             return Query.Types[0].getNumElements() <= 16;
150           },
151           0, s8)
152       .moreElementsToNextPow2(0);
153 
154   getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
155       .customIf([=](const LegalityQuery &Query) {
156         const auto &SrcTy = Query.Types[0];
157         const auto &AmtTy = Query.Types[1];
158         return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
159                AmtTy.getSizeInBits() == 32;
160       })
161       .legalFor({
162           {s32, s32},
163           {s32, s64},
164           {s64, s64},
165           {v8s8, v8s8},
166           {v16s8, v16s8},
167           {v4s16, v4s16},
168           {v8s16, v8s16},
169           {v2s32, v2s32},
170           {v4s32, v4s32},
171           {v2s64, v2s64},
172       })
173       .widenScalarToNextPow2(0)
174       .clampScalar(1, s32, s64)
175       .clampScalar(0, s32, s64)
176       .clampNumElements(0, v8s8, v16s8)
177       .clampNumElements(0, v4s16, v8s16)
178       .clampNumElements(0, v2s32, v4s32)
179       .clampNumElements(0, v2s64, v2s64)
180       .moreElementsToNextPow2(0)
181       .minScalarSameAs(1, 0);
182 
183   getActionDefinitionsBuilder(G_PTR_ADD)
184       .legalFor({{p0, s64}, {v2p0, v2s64}})
185       .clampScalarOrElt(1, s64, s64)
186       .clampNumElements(0, v2p0, v2p0);
187 
188   getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
189 
190   getActionDefinitionsBuilder({G_SDIV, G_UDIV})
191       .legalFor({s32, s64})
192       .libcallFor({s128})
193       .clampScalar(0, s32, s64)
194       .widenScalarToNextPow2(0)
195       .scalarize(0);
196 
197   getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
198       .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32})
199       .widenScalarOrEltToNextPow2(0)
200       .clampScalarOrElt(0, s32, s64)
201       .clampNumElements(0, v2s32, v4s32)
202       .clampNumElements(0, v2s64, v2s64)
203       .moreElementsToNextPow2(0);
204 
205 
206   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
207       .widenScalarToNextPow2(0, /*Min = */ 32)
208       .clampScalar(0, s32, s64)
209       .lower();
210 
211   getActionDefinitionsBuilder({G_SMULH, G_UMULH})
212       .legalFor({s64, v8s16, v16s8, v4s32})
213       .lower();
214 
215   auto &MinMaxActions = getActionDefinitionsBuilder(
216       {G_SMIN, G_SMAX, G_UMIN, G_UMAX});
217   if (HasCSSC)
218     MinMaxActions
219         .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
220         // Making clamping conditional on CSSC extension as without legal types we
221         // lower to CMP which can fold one of the two sxtb's we'd otherwise need
222         // if we detect a type smaller than 32-bit.
223         .minScalar(0, s32);
224   else
225     MinMaxActions
226         .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32});
227   MinMaxActions
228       .clampNumElements(0, v8s8, v16s8)
229       .clampNumElements(0, v4s16, v8s16)
230       .clampNumElements(0, v2s32, v4s32)
231       // FIXME: This shouldn't be needed as v2s64 types are going to
232       // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
233       .clampNumElements(0, v2s64, v2s64)
234       .lower();
235 
236   getActionDefinitionsBuilder(
237       {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
238       .legalFor({{s32, s32}, {s64, s32}})
239       .clampScalar(0, s32, s64)
240        .clampScalar(1, s32, s64)
241       .widenScalarToNextPow2(0);
242 
243   getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FNEG,
244                                G_FABS, G_FSQRT, G_FMAXNUM, G_FMINNUM,
245                                G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR,
246                                G_FRINT, G_FNEARBYINT, G_INTRINSIC_TRUNC,
247                                G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
248       .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
249       .legalIf([=](const LegalityQuery &Query) {
250         const auto &Ty = Query.Types[0];
251         return (Ty == v8s16 || Ty == v4s16) && HasFP16;
252       })
253       .libcallFor({s128})
254       .minScalarOrElt(0, MinFPScalar)
255       .clampNumElements(0, v4s16, v8s16)
256       .clampNumElements(0, v2s32, v4s32)
257       .clampNumElements(0, v2s64, v2s64)
258       .moreElementsToNextPow2(0);
259 
260   getActionDefinitionsBuilder(G_FREM)
261       .libcallFor({s32, s64})
262       .minScalar(0, s32)
263       .scalarize(0);
264 
265   getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
266       .legalFor({{s64, MinFPScalar}, {s64, s32}, {s64, s64}})
267       .libcallFor({{s64, s128}})
268       .minScalarOrElt(1, MinFPScalar);
269 
270   getActionDefinitionsBuilder(
271       {G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2, G_FLOG10, G_FTAN, G_FEXP,
272        G_FEXP2, G_FEXP10, G_FACOS, G_FASIN, G_FATAN, G_FCOSH, G_FSINH, G_FTANH})
273       // We need a call for these, so we always need to scalarize.
274       .scalarize(0)
275       // Regardless of FP16 support, widen 16-bit elements to 32-bits.
276       .minScalar(0, s32)
277       .libcallFor({s32, s64});
278   getActionDefinitionsBuilder(G_FPOWI)
279       .scalarize(0)
280       .minScalar(0, s32)
281       .libcallFor({{s32, s32}, {s64, s32}});
282 
283   getActionDefinitionsBuilder(G_INSERT)
284       .legalIf(all(typeInSet(0, {s32, s64, p0}),
285                    typeInSet(1, {s8, s16, s32}), smallerThan(1, 0)))
286       .widenScalarToNextPow2(0)
287       .clampScalar(0, s32, s64)
288       .widenScalarToNextPow2(1)
289       .minScalar(1, s8)
290       .maxScalarIf(typeInSet(0, {s32}), 1, s16)
291       .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);
292 
293   getActionDefinitionsBuilder(G_EXTRACT)
294       .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
295                    typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
296       .widenScalarToNextPow2(1)
297       .clampScalar(1, s32, s128)
298       .widenScalarToNextPow2(0)
299       .minScalar(0, s16)
300       .maxScalarIf(typeInSet(1, {s32}), 0, s16)
301       .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
302       .maxScalarIf(typeInSet(1, {s128}), 0, s64);
303 
304 
305   for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
306     auto &Actions =  getActionDefinitionsBuilder(Op);
307 
308     if (Op == G_SEXTLOAD)
309       Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));
310 
311     // Atomics have zero extending behavior.
312     Actions
313       .legalForTypesWithMemDesc({{s32, p0, s8, 8},
314                                  {s32, p0, s16, 8},
315                                  {s32, p0, s32, 8},
316                                  {s64, p0, s8, 2},
317                                  {s64, p0, s16, 2},
318                                  {s64, p0, s32, 4},
319                                  {s64, p0, s64, 8},
320                                  {p0, p0, s64, 8},
321                                  {v2s32, p0, s64, 8}})
322       .widenScalarToNextPow2(0)
323       .clampScalar(0, s32, s64)
324       // TODO: We could support sum-of-pow2's but the lowering code doesn't know
325       //       how to do that yet.
326       .unsupportedIfMemSizeNotPow2()
327       // Lower anything left over into G_*EXT and G_LOAD
328       .lower();
329   }
330 
331   auto IsPtrVecPred = [=](const LegalityQuery &Query) {
332     const LLT &ValTy = Query.Types[0];
333     return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0;
334   };
335 
336   auto &LoadActions = getActionDefinitionsBuilder(G_LOAD);
337   auto &StoreActions = getActionDefinitionsBuilder(G_STORE);
338 
339   if (ST.hasSVE()) {
340     LoadActions.legalForTypesWithMemDesc({
341         // 128 bit base sizes
342         {nxv16s8, p0, nxv16s8, 8},
343         {nxv8s16, p0, nxv8s16, 8},
344         {nxv4s32, p0, nxv4s32, 8},
345         {nxv2s64, p0, nxv2s64, 8},
346     });
347 
348     // TODO: Add nxv2p0. Consider bitcastIf.
349     //       See #92130
350     //       https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
351     StoreActions.legalForTypesWithMemDesc({
352         // 128 bit base sizes
353         {nxv16s8, p0, nxv16s8, 8},
354         {nxv8s16, p0, nxv8s16, 8},
355         {nxv4s32, p0, nxv4s32, 8},
356         {nxv2s64, p0, nxv2s64, 8},
357     });
358   }
359 
360   LoadActions
361       .customIf([=](const LegalityQuery &Query) {
362         return HasRCPC3 && Query.Types[0] == s128 &&
363                Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
364       })
365       .customIf([=](const LegalityQuery &Query) {
366         return Query.Types[0] == s128 &&
367                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
368       })
369       .legalForTypesWithMemDesc({{s8, p0, s8, 8},
370                                  {s16, p0, s16, 8},
371                                  {s32, p0, s32, 8},
372                                  {s64, p0, s64, 8},
373                                  {p0, p0, s64, 8},
374                                  {s128, p0, s128, 8},
375                                  {v8s8, p0, s64, 8},
376                                  {v16s8, p0, s128, 8},
377                                  {v4s16, p0, s64, 8},
378                                  {v8s16, p0, s128, 8},
379                                  {v2s32, p0, s64, 8},
380                                  {v4s32, p0, s128, 8},
381                                  {v2s64, p0, s128, 8}})
382       // These extends are also legal
383       .legalForTypesWithMemDesc(
384           {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
385       .widenScalarToNextPow2(0, /* MinSize = */ 8)
386       .clampMaxNumElements(0, s8, 16)
387       .clampMaxNumElements(0, s16, 8)
388       .clampMaxNumElements(0, s32, 4)
389       .clampMaxNumElements(0, s64, 2)
390       .clampMaxNumElements(0, p0, 2)
391       .lowerIfMemSizeNotByteSizePow2()
392       .clampScalar(0, s8, s64)
393       .narrowScalarIf(
394           [=](const LegalityQuery &Query) {
395             // Clamp extending load results to 32-bits.
396             return Query.Types[0].isScalar() &&
397                    Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
398                    Query.Types[0].getSizeInBits() > 32;
399           },
400           changeTo(0, s32))
401       // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
402       .bitcastIf(typeInSet(0, {v4s8}),
403                  [=](const LegalityQuery &Query) {
404                    const LLT VecTy = Query.Types[0];
405                    return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
406                  })
407       .customIf(IsPtrVecPred)
408       .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0);
409 
410   StoreActions
411       .customIf([=](const LegalityQuery &Query) {
412         return HasRCPC3 && Query.Types[0] == s128 &&
413                Query.MMODescrs[0].Ordering == AtomicOrdering::Release;
414       })
415       .customIf([=](const LegalityQuery &Query) {
416         return Query.Types[0] == s128 &&
417                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
418       })
419       .legalForTypesWithMemDesc(
420           {{s8, p0, s8, 8},     {s16, p0, s8, 8},  // truncstorei8 from s16
421            {s32, p0, s8, 8},                       // truncstorei8 from s32
422            {s64, p0, s8, 8},                       // truncstorei8 from s64
423            {s16, p0, s16, 8},   {s32, p0, s16, 8}, // truncstorei16 from s32
424            {s64, p0, s16, 8},                      // truncstorei16 from s64
425            {s32, p0, s8, 8},    {s32, p0, s16, 8},    {s32, p0, s32, 8},
426            {s64, p0, s64, 8},   {s64, p0, s32, 8}, // truncstorei32 from s64
427            {p0, p0, s64, 8},    {s128, p0, s128, 8},  {v16s8, p0, s128, 8},
428            {v8s8, p0, s64, 8},  {v4s16, p0, s64, 8},  {v8s16, p0, s128, 8},
429            {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
430       .clampScalar(0, s8, s64)
431       .lowerIf([=](const LegalityQuery &Query) {
432         return Query.Types[0].isScalar() &&
433                Query.Types[0] != Query.MMODescrs[0].MemoryTy;
434       })
435       // Maximum: sN * k = 128
436       .clampMaxNumElements(0, s8, 16)
437       .clampMaxNumElements(0, s16, 8)
438       .clampMaxNumElements(0, s32, 4)
439       .clampMaxNumElements(0, s64, 2)
440       .clampMaxNumElements(0, p0, 2)
441       .lowerIfMemSizeNotPow2()
442       // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
443       .bitcastIf(typeInSet(0, {v4s8}),
444                  [=](const LegalityQuery &Query) {
445                    const LLT VecTy = Query.Types[0];
446                    return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
447                  })
448       .customIf(IsPtrVecPred)
449       .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0);
450 
451   getActionDefinitionsBuilder(G_INDEXED_STORE)
452       // Idx 0 == Ptr, Idx 1 == Val
453       // TODO: we can implement legalizations but as of now these are
454       // generated in a very specific way.
455       .legalForTypesWithMemDesc({
456           {p0, s8, s8, 8},
457           {p0, s16, s16, 8},
458           {p0, s32, s8, 8},
459           {p0, s32, s16, 8},
460           {p0, s32, s32, 8},
461           {p0, s64, s64, 8},
462           {p0, p0, p0, 8},
463           {p0, v8s8, v8s8, 8},
464           {p0, v16s8, v16s8, 8},
465           {p0, v4s16, v4s16, 8},
466           {p0, v8s16, v8s16, 8},
467           {p0, v2s32, v2s32, 8},
468           {p0, v4s32, v4s32, 8},
469           {p0, v2s64, v2s64, 8},
470           {p0, v2p0, v2p0, 8},
471           {p0, s128, s128, 8},
472       })
473       .unsupported();
474 
475   auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {
476     LLT LdTy = Query.Types[0];
477     LLT PtrTy = Query.Types[1];
478     if (!llvm::is_contained(PackedVectorAllTypesVec, LdTy) &&
479         !llvm::is_contained(ScalarAndPtrTypesVec, LdTy) && LdTy != s128)
480       return false;
481     if (PtrTy != p0)
482       return false;
483     return true;
484   };
485   getActionDefinitionsBuilder(G_INDEXED_LOAD)
486       .unsupportedIf(
487           atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
488       .legalIf(IndexedLoadBasicPred)
489       .unsupported();
490   getActionDefinitionsBuilder({G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD})
491       .unsupportedIf(
492           atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
493       .legalIf(all(typeInSet(0, {s16, s32, s64}),
494                    LegalityPredicate([=](const LegalityQuery &Q) {
495                      LLT LdTy = Q.Types[0];
496                      LLT PtrTy = Q.Types[1];
497                      LLT MemTy = Q.MMODescrs[0].MemoryTy;
498                      if (PtrTy != p0)
499                        return false;
500                      if (LdTy == s16)
501                        return MemTy == s8;
502                      if (LdTy == s32)
503                        return MemTy == s8 || MemTy == s16;
504                      if (LdTy == s64)
505                        return MemTy == s8 || MemTy == s16 || MemTy == s32;
506                      return false;
507                    })))
508       .unsupported();
509 
510   // Constants
511   getActionDefinitionsBuilder(G_CONSTANT)
512       .legalFor({p0, s8, s16, s32, s64})
513       .widenScalarToNextPow2(0)
514       .clampScalar(0, s8, s64);
515   getActionDefinitionsBuilder(G_FCONSTANT)
516       .legalIf([=](const LegalityQuery &Query) {
517         const auto &Ty = Query.Types[0];
518         if (HasFP16 && Ty == s16)
519           return true;
520         return Ty == s32 || Ty == s64 || Ty == s128;
521       })
522       .clampScalar(0, MinFPScalar, s128);
523 
524   // FIXME: fix moreElementsToNextPow2
525   getActionDefinitionsBuilder(G_ICMP)
526       .legalFor({{s32, s32}, {s32, s64}, {s32, p0}})
527       .widenScalarOrEltToNextPow2(1)
528       .clampScalar(1, s32, s64)
529       .clampScalar(0, s32, s32)
530       .minScalarEltSameAsIf(
531           [=](const LegalityQuery &Query) {
532             const LLT &Ty = Query.Types[0];
533             const LLT &SrcTy = Query.Types[1];
534             return Ty.isVector() && !SrcTy.isPointerVector() &&
535                    Ty.getElementType() != SrcTy.getElementType();
536           },
537           0, 1)
538       .minScalarOrEltIf(
539           [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
540           1, s32)
541       .minScalarOrEltIf(
542           [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
543           s64)
544       .moreElementsToNextPow2(1)
545       .clampNumElements(1, v8s8, v16s8)
546       .clampNumElements(1, v4s16, v8s16)
547       .clampNumElements(1, v2s32, v4s32)
548       .clampNumElements(1, v2s64, v2s64)
549       .customIf(isVector(0));
550 
551   getActionDefinitionsBuilder(G_FCMP)
552       .legalFor({{s32, MinFPScalar},
553                  {s32, s32},
554                  {s32, s64},
555                  {v4s32, v4s32},
556                  {v2s32, v2s32},
557                  {v2s64, v2s64}})
558       .legalIf([=](const LegalityQuery &Query) {
559         const auto &Ty = Query.Types[1];
560         return (Ty == v8s16 || Ty == v4s16) && Ty == Query.Types[0] && HasFP16;
561       })
562       .widenScalarOrEltToNextPow2(1)
563       .clampScalar(0, s32, s32)
564       .clampScalarOrElt(1, MinFPScalar, s64)
565       .minScalarEltSameAsIf(
566           [=](const LegalityQuery &Query) {
567             const LLT &Ty = Query.Types[0];
568             const LLT &SrcTy = Query.Types[1];
569             return Ty.isVector() && !SrcTy.isPointerVector() &&
570                    Ty.getElementType() != SrcTy.getElementType();
571           },
572           0, 1)
573       .clampNumElements(1, v4s16, v8s16)
574       .clampNumElements(1, v2s32, v4s32)
575       .clampMaxNumElements(1, s64, 2)
576       .moreElementsToNextPow2(1);
577 
578   // Extensions
579   auto ExtLegalFunc = [=](const LegalityQuery &Query) {
580     unsigned DstSize = Query.Types[0].getSizeInBits();
581 
582     // Handle legal vectors using legalFor
583     if (Query.Types[0].isVector())
584       return false;
585 
586     if (DstSize < 8 || DstSize >= 128 || !isPowerOf2_32(DstSize))
587       return false; // Extending to a scalar s128 needs narrowing.
588 
589     const LLT &SrcTy = Query.Types[1];
590 
591     // Make sure we fit in a register otherwise. Don't bother checking that
592     // the source type is below 128 bits. We shouldn't be allowing anything
593     // through which is wider than the destination in the first place.
594     unsigned SrcSize = SrcTy.getSizeInBits();
595     if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
596       return false;
597 
598     return true;
599   };
600   getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
601       .legalIf(ExtLegalFunc)
602       .legalFor({{v2s64, v2s32}, {v4s32, v4s16}, {v8s16, v8s8}})
603       .clampScalar(0, s64, s64) // Just for s128, others are handled above.
604       .moreElementsToNextPow2(0)
605       .clampMaxNumElements(1, s8, 8)
606       .clampMaxNumElements(1, s16, 4)
607       .clampMaxNumElements(1, s32, 2)
608       // Tries to convert a large EXTEND into two smaller EXTENDs
609       .lowerIf([=](const LegalityQuery &Query) {
610         return (Query.Types[0].getScalarSizeInBits() >
611                 Query.Types[1].getScalarSizeInBits() * 2) &&
612                Query.Types[0].isVector() &&
613                (Query.Types[1].getScalarSizeInBits() == 8 ||
614                 Query.Types[1].getScalarSizeInBits() == 16);
615       })
616       .clampMinNumElements(1, s8, 8)
617       .clampMinNumElements(1, s16, 4);
618 
619   getActionDefinitionsBuilder(G_TRUNC)
620       .legalFor({{v2s32, v2s64}, {v4s16, v4s32}, {v8s8, v8s16}})
621       .moreElementsToNextPow2(0)
622       .clampMaxNumElements(0, s8, 8)
623       .clampMaxNumElements(0, s16, 4)
624       .clampMaxNumElements(0, s32, 2)
625       .minScalarOrEltIf(
626           [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
627           0, s8)
628       .lowerIf([=](const LegalityQuery &Query) {
629         LLT DstTy = Query.Types[0];
630         LLT SrcTy = Query.Types[1];
631         return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
632                DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
633       })
634       .clampMinNumElements(0, s8, 8)
635       .clampMinNumElements(0, s16, 4)
636       .alwaysLegal();
637 
638   getActionDefinitionsBuilder(G_SEXT_INREG)
639       .legalFor({s32, s64})
640       .legalFor(PackedVectorAllTypeList)
641       .maxScalar(0, s64)
642       .clampNumElements(0, v8s8, v16s8)
643       .clampNumElements(0, v4s16, v8s16)
644       .clampNumElements(0, v2s32, v4s32)
645       .clampMaxNumElements(0, s64, 2)
646       .lower();
647 
648   // FP conversions
649   getActionDefinitionsBuilder(G_FPTRUNC)
650       .legalFor(
651           {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
652       .libcallFor({{s16, s128}, {s32, s128}, {s64, s128}})
653       .clampNumElements(0, v4s16, v4s16)
654       .clampNumElements(0, v2s32, v2s32)
655       .scalarize(0);
656 
657   getActionDefinitionsBuilder(G_FPEXT)
658       .legalFor(
659           {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
660       .libcallFor({{s128, s64}, {s128, s32}, {s128, s16}})
661       .clampNumElements(0, v4s32, v4s32)
662       .clampNumElements(0, v2s64, v2s64)
663       .scalarize(0);
664 
665   // Conversions
666   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
667       .legalFor({{s32, s32},
668                  {s64, s32},
669                  {s32, s64},
670                  {s64, s64},
671                  {v2s64, v2s64},
672                  {v4s32, v4s32},
673                  {v2s32, v2s32}})
674       .legalIf([=](const LegalityQuery &Query) {
675         return HasFP16 &&
676                (Query.Types[1] == s16 || Query.Types[1] == v4s16 ||
677                 Query.Types[1] == v8s16) &&
678                (Query.Types[0] == s32 || Query.Types[0] == s64 ||
679                 Query.Types[0] == v4s16 || Query.Types[0] == v8s16);
680       })
681       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
682       .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
683       // The range of a fp16 value fits into an i17, so we can lower the width
684       // to i64.
685       .narrowScalarIf(
686           [=](const LegalityQuery &Query) {
687             return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64;
688           },
689           changeTo(0, s64))
690       .moreElementsToNextPow2(0)
691       .widenScalarOrEltToNextPow2OrMinSize(0)
692       .minScalar(0, s32)
693       .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
694       .widenScalarIf(
695           [=](const LegalityQuery &Query) {
696             return Query.Types[0].getScalarSizeInBits() <= 64 &&
697                    Query.Types[0].getScalarSizeInBits() >
698                        Query.Types[1].getScalarSizeInBits();
699           },
700           LegalizeMutations::changeElementSizeTo(1, 0))
701       .widenScalarIf(
702           [=](const LegalityQuery &Query) {
703             return Query.Types[1].getScalarSizeInBits() <= 64 &&
704                    Query.Types[0].getScalarSizeInBits() <
705                        Query.Types[1].getScalarSizeInBits();
706           },
707           LegalizeMutations::changeElementSizeTo(0, 1))
708       .clampNumElements(0, v4s16, v8s16)
709       .clampNumElements(0, v2s32, v4s32)
710       .clampMaxNumElements(0, s64, 2)
711       .libcallFor(
712           {{s32, s128}, {s64, s128}, {s128, s128}, {s128, s32}, {s128, s64}});
713 
714   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
715       .legalFor({{s32, s32},
716                  {s64, s32},
717                  {s32, s64},
718                  {s64, s64},
719                  {v2s64, v2s64},
720                  {v4s32, v4s32},
721                  {v2s32, v2s32}})
722       .legalIf([=](const LegalityQuery &Query) {
723         return HasFP16 &&
724                (Query.Types[0] == s16 || Query.Types[0] == v4s16 ||
725                 Query.Types[0] == v8s16) &&
726                (Query.Types[1] == s32 || Query.Types[1] == s64 ||
727                 Query.Types[1] == v4s16 || Query.Types[1] == v8s16);
728       })
729       .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
730       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
731       .moreElementsToNextPow2(1)
732       .widenScalarOrEltToNextPow2OrMinSize(1)
733       .minScalar(1, s32)
734       .widenScalarOrEltToNextPow2OrMinSize(0, /*MinSize=*/HasFP16 ? 16 : 32)
735       .widenScalarIf(
736           [=](const LegalityQuery &Query) {
737             return Query.Types[1].getScalarSizeInBits() <= 64 &&
738                    Query.Types[0].getScalarSizeInBits() <
739                        Query.Types[1].getScalarSizeInBits();
740           },
741           LegalizeMutations::changeElementSizeTo(0, 1))
742       .widenScalarIf(
743           [=](const LegalityQuery &Query) {
744             return Query.Types[0].getScalarSizeInBits() <= 64 &&
745                    Query.Types[0].getScalarSizeInBits() >
746                        Query.Types[1].getScalarSizeInBits();
747           },
748           LegalizeMutations::changeElementSizeTo(1, 0))
749       .clampNumElements(0, v4s16, v8s16)
750       .clampNumElements(0, v2s32, v4s32)
751       .clampMaxNumElements(0, s64, 2)
752       .libcallFor({{s16, s128},
753                    {s32, s128},
754                    {s64, s128},
755                    {s128, s128},
756                    {s128, s32},
757                    {s128, s64}});
758 
759   // Control-flow
760   getActionDefinitionsBuilder(G_BRCOND)
761     .legalFor({s32})
762     .clampScalar(0, s32, s32);
763   getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
764 
765   getActionDefinitionsBuilder(G_SELECT)
766       .legalFor({{s32, s32}, {s64, s32}, {p0, s32}})
767       .widenScalarToNextPow2(0)
768       .clampScalar(0, s32, s64)
769       .clampScalar(1, s32, s32)
770       .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
771       .lowerIf(isVector(0));
772 
773   // Pointer-handling
774   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
775 
776   if (TM.getCodeModel() == CodeModel::Small)
777     getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
778   else
779     getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
780 
781   getActionDefinitionsBuilder(G_PTRAUTH_GLOBAL_VALUE)
782       .legalIf(all(typeIs(0, p0), typeIs(1, p0)));
783 
784   getActionDefinitionsBuilder(G_PTRTOINT)
785       .legalFor({{s64, p0}, {v2s64, v2p0}})
786       .widenScalarToNextPow2(0, 64)
787       .clampScalar(0, s64, s64);
788 
789   getActionDefinitionsBuilder(G_INTTOPTR)
790       .unsupportedIf([&](const LegalityQuery &Query) {
791         return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
792       })
793       .legalFor({{p0, s64}, {v2p0, v2s64}});
794 
795   // Casts for 32 and 64-bit width type are just copies.
796   // Same for 128-bit width type, except they are on the FPR bank.
797   getActionDefinitionsBuilder(G_BITCAST)
798       // Keeping 32-bit instructions legal to prevent regression in some tests
799       .legalForCartesianProduct({s32, v2s16, v4s8})
800       .legalForCartesianProduct({s64, v8s8, v4s16, v2s32})
801       .legalForCartesianProduct({s128, v16s8, v8s16, v4s32, v2s64, v2p0})
802       .lowerIf([=](const LegalityQuery &Query) {
803         return Query.Types[0].isVector() != Query.Types[1].isVector();
804       })
805       .moreElementsToNextPow2(0)
806       .clampNumElements(0, v8s8, v16s8)
807       .clampNumElements(0, v4s16, v8s16)
808       .clampNumElements(0, v2s32, v4s32)
809       .lower();
810 
811   getActionDefinitionsBuilder(G_VASTART).legalFor({p0});
812 
813   // va_list must be a pointer, but most sized types are pretty easy to handle
814   // as the destination.
815   getActionDefinitionsBuilder(G_VAARG)
816       .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
817       .clampScalar(0, s8, s64)
818       .widenScalarToNextPow2(0, /*Min*/ 8);
819 
820   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
821       .lowerIf(
822           all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
823 
824   LegalityPredicate UseOutlineAtomics = [&ST](const LegalityQuery &Query) {
825     return ST.outlineAtomics() && !ST.hasLSE();
826   };
827 
828   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
829       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
830                    predNot(UseOutlineAtomics)))
831       .customIf(all(typeIs(0, s128), predNot(UseOutlineAtomics)))
832       .customIf([UseOutlineAtomics](const LegalityQuery &Query) {
833         return Query.Types[0].getSizeInBits() == 128 &&
834                !UseOutlineAtomics(Query);
835       })
836       .libcallIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, p0),
837                      UseOutlineAtomics))
838       .clampScalar(0, s32, s64);
839 
840   getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
841                                G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR,
842                                G_ATOMICRMW_XOR})
843       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
844                    predNot(UseOutlineAtomics)))
845       .libcallIf(all(typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0),
846                      UseOutlineAtomics))
847       .clampScalar(0, s32, s64);
848 
849   // Do not outline these atomics operations, as per comment in
850   // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR().
851   getActionDefinitionsBuilder(
852       {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
853       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)))
854       .clampScalar(0, s32, s64);
855 
856   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});
857 
858   // Merge/Unmerge
859   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
860     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
861     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
862     getActionDefinitionsBuilder(Op)
863         .widenScalarToNextPow2(LitTyIdx, 8)
864         .widenScalarToNextPow2(BigTyIdx, 32)
865         .clampScalar(LitTyIdx, s8, s64)
866         .clampScalar(BigTyIdx, s32, s128)
867         .legalIf([=](const LegalityQuery &Q) {
868           switch (Q.Types[BigTyIdx].getSizeInBits()) {
869           case 32:
870           case 64:
871           case 128:
872             break;
873           default:
874             return false;
875           }
876           switch (Q.Types[LitTyIdx].getSizeInBits()) {
877           case 8:
878           case 16:
879           case 32:
880           case 64:
881             return true;
882           default:
883             return false;
884           }
885         });
886   }
887 
888   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
889       .unsupportedIf([=](const LegalityQuery &Query) {
890         const LLT &EltTy = Query.Types[1].getElementType();
891         return Query.Types[0] != EltTy;
892       })
893       .minScalar(2, s64)
894       .customIf([=](const LegalityQuery &Query) {
895         const LLT &VecTy = Query.Types[1];
896         return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
897                VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
898                VecTy == v8s8 || VecTy == v16s8 || VecTy == v2p0;
899       })
900       .minScalarOrEltIf(
901           [=](const LegalityQuery &Query) {
902             // We want to promote to <M x s1> to <M x s64> if that wouldn't
903             // cause the total vec size to be > 128b.
904             return Query.Types[1].getNumElements() <= 2;
905           },
906           0, s64)
907       .minScalarOrEltIf(
908           [=](const LegalityQuery &Query) {
909             return Query.Types[1].getNumElements() <= 4;
910           },
911           0, s32)
912       .minScalarOrEltIf(
913           [=](const LegalityQuery &Query) {
914             return Query.Types[1].getNumElements() <= 8;
915           },
916           0, s16)
917       .minScalarOrEltIf(
918           [=](const LegalityQuery &Query) {
919             return Query.Types[1].getNumElements() <= 16;
920           },
921           0, s8)
922       .minScalarOrElt(0, s8) // Worst case, we need at least s8.
923       .moreElementsToNextPow2(1)
924       .clampMaxNumElements(1, s64, 2)
925       .clampMaxNumElements(1, s32, 4)
926       .clampMaxNumElements(1, s16, 8)
927       .clampMaxNumElements(1, s8, 16)
928       .clampMaxNumElements(1, p0, 2);
929 
930   getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
931       .legalIf(
932           typeInSet(0, {v16s8, v8s8, v8s16, v4s16, v4s32, v2s32, v2s64, v2p0}))
933       .moreElementsToNextPow2(0)
934       .widenVectorEltsToVectorMinSize(0, 64)
935       .clampNumElements(0, v8s8, v16s8)
936       .clampNumElements(0, v4s16, v8s16)
937       .clampNumElements(0, v2s32, v4s32)
938       .clampMaxNumElements(0, s64, 2)
939       .clampMaxNumElements(0, p0, 2);
940 
941   getActionDefinitionsBuilder(G_BUILD_VECTOR)
942       .legalFor({{v8s8, s8},
943                  {v16s8, s8},
944                  {v4s16, s16},
945                  {v8s16, s16},
946                  {v2s32, s32},
947                  {v4s32, s32},
948                  {v2p0, p0},
949                  {v2s64, s64}})
950       .clampNumElements(0, v4s32, v4s32)
951       .clampNumElements(0, v2s64, v2s64)
952       .minScalarOrElt(0, s8)
953       .widenVectorEltsToVectorMinSize(0, 64)
954       .minScalarSameAs(1, 0);
955 
956   getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
957 
958   getActionDefinitionsBuilder(G_CTLZ)
959       .legalForCartesianProduct(
960           {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
961       .scalarize(1)
962       .widenScalarToNextPow2(1, /*Min=*/32)
963       .clampScalar(1, s32, s64)
964       .scalarSameSizeAs(0, 1);
965   getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();
966 
967   // TODO: Custom lowering for v2s32, v4s32, v2s64.
968   getActionDefinitionsBuilder(G_BITREVERSE)
969       .legalFor({s32, s64, v8s8, v16s8})
970       .widenScalarToNextPow2(0, /*Min = */ 32)
971       .clampScalar(0, s32, s64);
972 
973   getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();
974 
975   getActionDefinitionsBuilder(G_CTTZ)
976       .lowerIf(isVector(0))
977       .widenScalarToNextPow2(1, /*Min=*/32)
978       .clampScalar(1, s32, s64)
979       .scalarSameSizeAs(0, 1)
980       .legalIf([=](const LegalityQuery &Query) {
981         return (HasCSSC && typeInSet(0, {s32, s64})(Query));
982       })
983       .customIf([=](const LegalityQuery &Query) {
984         return (!HasCSSC && typeInSet(0, {s32, s64})(Query));
985       });
986 
987   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
988       .legalIf([=](const LegalityQuery &Query) {
989         const LLT &DstTy = Query.Types[0];
990         const LLT &SrcTy = Query.Types[1];
991         // For now just support the TBL2 variant which needs the source vectors
992         // to be the same size as the dest.
993         if (DstTy != SrcTy)
994           return false;
995         return llvm::is_contained(
996             {v2s64, v2p0, v2s32, v4s32, v4s16, v16s8, v8s8, v8s16}, DstTy);
997       })
998       // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we
999       // just want those lowered into G_BUILD_VECTOR
1000       .lowerIf([=](const LegalityQuery &Query) {
1001         return !Query.Types[1].isVector();
1002       })
1003       .moreElementsIf(
1004           [](const LegalityQuery &Query) {
1005             return Query.Types[0].isVector() && Query.Types[1].isVector() &&
1006                    Query.Types[0].getNumElements() >
1007                        Query.Types[1].getNumElements();
1008           },
1009           changeTo(1, 0))
1010       .moreElementsToNextPow2(0)
1011       .moreElementsIf(
1012           [](const LegalityQuery &Query) {
1013             return Query.Types[0].isVector() && Query.Types[1].isVector() &&
1014                    Query.Types[0].getNumElements() <
1015                        Query.Types[1].getNumElements();
1016           },
1017           changeTo(0, 1))
1018       .widenScalarOrEltToNextPow2OrMinSize(0, 8)
1019       .clampNumElements(0, v8s8, v16s8)
1020       .clampNumElements(0, v4s16, v8s16)
1021       .clampNumElements(0, v4s32, v4s32)
1022       .clampNumElements(0, v2s64, v2s64);
1023 
1024   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1025       .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}})
1026       .bitcastIf(
1027           [=](const LegalityQuery &Query) {
1028             return Query.Types[0].getSizeInBits() <= 128 &&
1029                    Query.Types[1].getSizeInBits() <= 64;
1030           },
1031           [=](const LegalityQuery &Query) {
1032             const LLT DstTy = Query.Types[0];
1033             const LLT SrcTy = Query.Types[1];
1034             return std::pair(
1035                 0, DstTy.changeElementSize(SrcTy.getSizeInBits())
1036                        .changeElementCount(
1037                            DstTy.getElementCount().divideCoefficientBy(
1038                                SrcTy.getNumElements())));
1039           });
1040 
1041   getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({p0});
1042 
1043   getActionDefinitionsBuilder(G_BRJT).legalFor({{p0, s64}});
1044 
1045   getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom();
1046 
1047   getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower();
1048 
1049   if (ST.hasMOPS()) {
1050     // G_BZERO is not supported. Currently it is only emitted by
1051     // PreLegalizerCombiner for G_MEMSET with zero constant.
1052     getActionDefinitionsBuilder(G_BZERO).unsupported();
1053 
1054     getActionDefinitionsBuilder(G_MEMSET)
1055         .legalForCartesianProduct({p0}, {s64}, {s64})
1056         .customForCartesianProduct({p0}, {s8}, {s64})
1057         .immIdx(0); // Inform verifier imm idx 0 is handled.
1058 
1059     getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
1060         .legalForCartesianProduct({p0}, {p0}, {s64})
1061         .immIdx(0); // Inform verifier imm idx 0 is handled.
1062 
1063     // G_MEMCPY_INLINE does not have a tailcall immediate
1064     getActionDefinitionsBuilder(G_MEMCPY_INLINE)
1065         .legalForCartesianProduct({p0}, {p0}, {s64});
1066 
1067   } else {
1068     getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
1069         .libcall();
1070   }
1071 
1072   // FIXME: Legal vector types are only legal with NEON.
1073   auto &ABSActions = getActionDefinitionsBuilder(G_ABS);
1074   if (HasCSSC)
1075     ABSActions
1076         .legalFor({s32, s64});
1077   ABSActions.legalFor(PackedVectorAllTypeList)
1078       .customIf([=](const LegalityQuery &Q) {
1079         // TODO: Fix suboptimal codegen for 128+ bit types.
1080         LLT SrcTy = Q.Types[0];
1081         return SrcTy.isScalar() && SrcTy.getSizeInBits() < 128;
1082       })
1083       .widenScalarIf(
1084           [=](const LegalityQuery &Query) { return Query.Types[0] == v4s8; },
1085           [=](const LegalityQuery &Query) { return std::make_pair(0, v4s16); })
1086       .widenScalarIf(
1087           [=](const LegalityQuery &Query) { return Query.Types[0] == v2s16; },
1088           [=](const LegalityQuery &Query) { return std::make_pair(0, v2s32); })
1089       .clampNumElements(0, v8s8, v16s8)
1090       .clampNumElements(0, v4s16, v8s16)
1091       .clampNumElements(0, v2s32, v4s32)
1092       .clampNumElements(0, v2s64, v2s64)
1093       .moreElementsToNextPow2(0)
1094       .lower();
1095 
1096   // For fadd reductions we have pairwise operations available. We treat the
1097   // usual legal types as legal and handle the lowering to pairwise instructions
1098   // later.
1099   getActionDefinitionsBuilder(G_VECREDUCE_FADD)
1100       .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
1101       .legalIf([=](const LegalityQuery &Query) {
1102         const auto &Ty = Query.Types[1];
1103         return (Ty == v4s16 || Ty == v8s16) && HasFP16;
1104       })
1105       .minScalarOrElt(0, MinFPScalar)
1106       .clampMaxNumElements(1, s64, 2)
1107       .clampMaxNumElements(1, s32, 4)
1108       .clampMaxNumElements(1, s16, 8)
1109       .lower();
1110 
1111   // For fmul reductions we need to split up into individual operations. We
1112   // clamp to 128 bit vectors then to 64bit vectors to produce a cascade of
1113   // smaller types, followed by scalarizing what remains.
1114   getActionDefinitionsBuilder(G_VECREDUCE_FMUL)
1115       .minScalarOrElt(0, MinFPScalar)
1116       .clampMaxNumElements(1, s64, 2)
1117       .clampMaxNumElements(1, s32, 4)
1118       .clampMaxNumElements(1, s16, 8)
1119       .clampMaxNumElements(1, s32, 2)
1120       .clampMaxNumElements(1, s16, 4)
1121       .scalarize(1)
1122       .lower();
1123 
1124   getActionDefinitionsBuilder({G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL})
1125       .scalarize(2)
1126       .lower();
1127 
1128   getActionDefinitionsBuilder(G_VECREDUCE_ADD)
1129       .legalFor({{s8, v16s8},
1130                  {s8, v8s8},
1131                  {s16, v8s16},
1132                  {s16, v4s16},
1133                  {s32, v4s32},
1134                  {s32, v2s32},
1135                  {s64, v2s64}})
1136       .clampMaxNumElements(1, s64, 2)
1137       .clampMaxNumElements(1, s32, 4)
1138       .clampMaxNumElements(1, s16, 8)
1139       .clampMaxNumElements(1, s8, 16)
1140       .lower();
1141 
1142   getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX,
1143                                G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM})
1144       .legalFor({{s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
1145       .legalIf([=](const LegalityQuery &Query) {
1146         const auto &Ty = Query.Types[1];
1147         return Query.Types[0] == s16 && (Ty == v8s16 || Ty == v4s16) && HasFP16;
1148       })
1149       .minScalarOrElt(0, MinFPScalar)
1150       .clampMaxNumElements(1, s64, 2)
1151       .clampMaxNumElements(1, s32, 4)
1152       .clampMaxNumElements(1, s16, 8)
1153       .lower();
1154 
1155   getActionDefinitionsBuilder(G_VECREDUCE_MUL)
1156       .clampMaxNumElements(1, s32, 2)
1157       .clampMaxNumElements(1, s16, 4)
1158       .clampMaxNumElements(1, s8, 8)
1159       .scalarize(1)
1160       .lower();
1161 
1162   getActionDefinitionsBuilder(
1163       {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX})
1164       .legalFor({{s8, v8s8},
1165                  {s8, v16s8},
1166                  {s16, v4s16},
1167                  {s16, v8s16},
1168                  {s32, v2s32},
1169                  {s32, v4s32}})
1170       .moreElementsIf(
1171           [=](const LegalityQuery &Query) {
1172             return Query.Types[1].isVector() &&
1173                    Query.Types[1].getElementType() != s8 &&
1174                    Query.Types[1].getNumElements() & 1;
1175           },
1176           LegalizeMutations::moreElementsToNextPow2(1))
1177       .clampMaxNumElements(1, s64, 2)
1178       .clampMaxNumElements(1, s32, 4)
1179       .clampMaxNumElements(1, s16, 8)
1180       .clampMaxNumElements(1, s8, 16)
1181       .scalarize(1)
1182       .lower();
1183 
1184   getActionDefinitionsBuilder(
1185       {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
1186       // Try to break down into smaller vectors as long as they're at least 64
1187       // bits. This lets us use vector operations for some parts of the
1188       // reduction.
1189       .fewerElementsIf(
1190           [=](const LegalityQuery &Q) {
1191             LLT SrcTy = Q.Types[1];
1192             if (SrcTy.isScalar())
1193               return false;
1194             if (!isPowerOf2_32(SrcTy.getNumElements()))
1195               return false;
1196             // We can usually perform 64b vector operations.
1197             return SrcTy.getSizeInBits() > 64;
1198           },
1199           [=](const LegalityQuery &Q) {
1200             LLT SrcTy = Q.Types[1];
1201             return std::make_pair(1, SrcTy.divide(2));
1202           })
1203       .scalarize(1)
1204       .lower();
1205 
1206   // TODO: Update this to correct handling when adding AArch64/SVE support.
1207   getActionDefinitionsBuilder(G_VECTOR_COMPRESS).lower();
1208 
1209   getActionDefinitionsBuilder({G_FSHL, G_FSHR})
1210       .customFor({{s32, s32}, {s32, s64}, {s64, s64}})
1211       .lower();
1212 
1213   getActionDefinitionsBuilder(G_ROTR)
1214       .legalFor({{s32, s64}, {s64, s64}})
1215       .customIf([=](const LegalityQuery &Q) {
1216         return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
1217       })
1218       .lower();
1219   getActionDefinitionsBuilder(G_ROTL).lower();
1220 
1221   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1222       .customFor({{s32, s32}, {s64, s64}});
1223 
1224   auto always = [=](const LegalityQuery &Q) { return true; };
1225   auto &CTPOPActions = getActionDefinitionsBuilder(G_CTPOP);
1226   if (HasCSSC)
1227     CTPOPActions
1228         .legalFor({{s32, s32},
1229                    {s64, s64},
1230                    {v8s8, v8s8},
1231                    {v16s8, v16s8}})
1232         .customFor({{s128, s128},
1233                     {v2s64, v2s64},
1234                     {v2s32, v2s32},
1235                     {v4s32, v4s32},
1236                     {v4s16, v4s16},
1237                     {v8s16, v8s16}});
1238   else
1239     CTPOPActions
1240         .legalFor({{v8s8, v8s8},
1241                    {v16s8, v16s8}})
1242         .customFor({{s32, s32},
1243                     {s64, s64},
1244                     {s128, s128},
1245                     {v2s64, v2s64},
1246                     {v2s32, v2s32},
1247                     {v4s32, v4s32},
1248                     {v4s16, v4s16},
1249                     {v8s16, v8s16}});
1250   CTPOPActions
1251       .clampScalar(0, s32, s128)
1252       .widenScalarToNextPow2(0)
1253       .minScalarEltSameAsIf(always, 1, 0)
1254       .maxScalarEltSameAsIf(always, 1, 0);
1255 
1256   getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
1257       .legalFor({v2s64, v2s32, v4s32, v4s16, v8s16, v8s8, v16s8})
1258       .clampNumElements(0, v8s8, v16s8)
1259       .clampNumElements(0, v4s16, v8s16)
1260       .clampNumElements(0, v2s32, v4s32)
1261       .clampMaxNumElements(0, s64, 2)
1262       .moreElementsToNextPow2(0)
1263       .lower();
1264 
1265   // TODO: Libcall support for s128.
1266   // TODO: s16 should be legal with full FP16 support.
1267   getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1268       .legalFor({{s64, s32}, {s64, s64}});
1269 
1270   // TODO: Custom legalization for mismatched types.
1271   getActionDefinitionsBuilder(G_FCOPYSIGN)
1272       .moreElementsIf(
1273           [](const LegalityQuery &Query) { return Query.Types[0].isScalar(); },
1274           [=](const LegalityQuery &Query) {
1275             const LLT Ty = Query.Types[0];
1276             return std::pair(0, LLT::fixed_vector(Ty == s16 ? 4 : 2, Ty));
1277           })
1278       .lower();
1279 
1280   getActionDefinitionsBuilder(G_FMAD).lower();
1281 
1282   // Access to floating-point environment.
1283   getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV,
1284                                G_GET_FPMODE, G_SET_FPMODE, G_RESET_FPMODE})
1285       .libcall();
1286 
1287   getActionDefinitionsBuilder(G_IS_FPCLASS).lower();
1288 
1289   getActionDefinitionsBuilder(G_PREFETCH).custom();
1290 
1291   getActionDefinitionsBuilder({G_SCMP, G_UCMP}).lower();
1292 
1293   getLegacyLegalizerInfo().computeTables();
1294   verify(*ST.getInstrInfo());
1295 }
1296 
1297 bool AArch64LegalizerInfo::legalizeCustom(
1298     LegalizerHelper &Helper, MachineInstr &MI,
1299     LostDebugLocObserver &LocObserver) const {
1300   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1301   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1302   GISelChangeObserver &Observer = Helper.Observer;
1303   switch (MI.getOpcode()) {
1304   default:
1305     // No idea what to do.
1306     return false;
1307   case TargetOpcode::G_VAARG:
1308     return legalizeVaArg(MI, MRI, MIRBuilder);
1309   case TargetOpcode::G_LOAD:
1310   case TargetOpcode::G_STORE:
1311     return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
1312   case TargetOpcode::G_SHL:
1313   case TargetOpcode::G_ASHR:
1314   case TargetOpcode::G_LSHR:
1315     return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
1316   case TargetOpcode::G_GLOBAL_VALUE:
1317     return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
1318   case TargetOpcode::G_SBFX:
1319   case TargetOpcode::G_UBFX:
1320     return legalizeBitfieldExtract(MI, MRI, Helper);
1321   case TargetOpcode::G_FSHL:
1322   case TargetOpcode::G_FSHR:
1323     return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper);
1324   case TargetOpcode::G_ROTR:
1325     return legalizeRotate(MI, MRI, Helper);
1326   case TargetOpcode::G_CTPOP:
1327     return legalizeCTPOP(MI, MRI, Helper);
1328   case TargetOpcode::G_ATOMIC_CMPXCHG:
1329     return legalizeAtomicCmpxchg128(MI, MRI, Helper);
1330   case TargetOpcode::G_CTTZ:
1331     return legalizeCTTZ(MI, Helper);
1332   case TargetOpcode::G_BZERO:
1333   case TargetOpcode::G_MEMCPY:
1334   case TargetOpcode::G_MEMMOVE:
1335   case TargetOpcode::G_MEMSET:
1336     return legalizeMemOps(MI, Helper);
1337   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1338     return legalizeExtractVectorElt(MI, MRI, Helper);
1339   case TargetOpcode::G_DYN_STACKALLOC:
1340     return legalizeDynStackAlloc(MI, Helper);
1341   case TargetOpcode::G_PREFETCH:
1342     return legalizePrefetch(MI, Helper);
1343   case TargetOpcode::G_ABS:
1344     return Helper.lowerAbsToCNeg(MI);
1345   case TargetOpcode::G_ICMP:
1346     return legalizeICMP(MI, MRI, MIRBuilder);
1347   }
1348 
1349   llvm_unreachable("expected switch to return");
1350 }
1351 
1352 bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
1353                                                MachineRegisterInfo &MRI,
1354                                                MachineIRBuilder &MIRBuilder,
1355                                                GISelChangeObserver &Observer,
1356                                                LegalizerHelper &Helper) const {
1357   assert(MI.getOpcode() == TargetOpcode::G_FSHL ||
1358          MI.getOpcode() == TargetOpcode::G_FSHR);
1359 
1360   // Keep as G_FSHR if shift amount is a G_CONSTANT, else use generic
1361   // lowering
1362   Register ShiftNo = MI.getOperand(3).getReg();
1363   LLT ShiftTy = MRI.getType(ShiftNo);
1364   auto VRegAndVal = getIConstantVRegValWithLookThrough(ShiftNo, MRI);
1365 
1366   // Adjust shift amount according to Opcode (FSHL/FSHR)
1367   // Convert FSHL to FSHR
1368   LLT OperationTy = MRI.getType(MI.getOperand(0).getReg());
1369   APInt BitWidth(ShiftTy.getSizeInBits(), OperationTy.getSizeInBits(), false);
1370 
1371   // Lower non-constant shifts and leave zero shifts to the optimizer.
1372   if (!VRegAndVal || VRegAndVal->Value.urem(BitWidth) == 0)
1373     return (Helper.lowerFunnelShiftAsShifts(MI) ==
1374             LegalizerHelper::LegalizeResult::Legalized);
1375 
1376   APInt Amount = VRegAndVal->Value.urem(BitWidth);
1377 
1378   Amount = MI.getOpcode() == TargetOpcode::G_FSHL ? BitWidth - Amount : Amount;
1379 
1380   // If the instruction is G_FSHR, has a 64-bit G_CONSTANT for shift amount
1381   // in the range of 0 <-> BitWidth, it is legal
1382   if (ShiftTy.getSizeInBits() == 64 && MI.getOpcode() == TargetOpcode::G_FSHR &&
1383       VRegAndVal->Value.ult(BitWidth))
1384     return true;
1385 
1386   // Cast the ShiftNumber to a 64-bit type
1387   auto Cast64 = MIRBuilder.buildConstant(LLT::scalar(64), Amount.zext(64));
1388 
1389   if (MI.getOpcode() == TargetOpcode::G_FSHR) {
1390     Observer.changingInstr(MI);
1391     MI.getOperand(3).setReg(Cast64.getReg(0));
1392     Observer.changedInstr(MI);
1393   }
1394   // If Opcode is FSHL, remove the FSHL instruction and create a FSHR
1395   // instruction
1396   else if (MI.getOpcode() == TargetOpcode::G_FSHL) {
1397     MIRBuilder.buildInstr(TargetOpcode::G_FSHR, {MI.getOperand(0).getReg()},
1398                           {MI.getOperand(1).getReg(), MI.getOperand(2).getReg(),
1399                            Cast64.getReg(0)});
1400     MI.eraseFromParent();
1401   }
1402   return true;
1403 }
1404 
1405 bool AArch64LegalizerInfo::legalizeICMP(MachineInstr &MI,
1406                                         MachineRegisterInfo &MRI,
1407                                         MachineIRBuilder &MIRBuilder) const {
1408   Register DstReg = MI.getOperand(0).getReg();
1409   Register SrcReg1 = MI.getOperand(2).getReg();
1410   Register SrcReg2 = MI.getOperand(3).getReg();
1411   LLT DstTy = MRI.getType(DstReg);
1412   LLT SrcTy = MRI.getType(SrcReg1);
1413 
1414   // Check the vector types are legal
1415   if (DstTy.getScalarSizeInBits() != SrcTy.getScalarSizeInBits() ||
1416       DstTy.getNumElements() != SrcTy.getNumElements() ||
1417       (DstTy.getSizeInBits() != 64 && DstTy.getSizeInBits() != 128))
1418     return false;
1419 
1420   // Lowers G_ICMP NE => G_ICMP EQ to allow better pattern matching for
1421   // following passes
1422   CmpInst::Predicate Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
1423   if (Pred != CmpInst::ICMP_NE)
1424     return true;
1425   Register CmpReg =
1426       MIRBuilder
1427           .buildICmp(CmpInst::ICMP_EQ, MRI.getType(DstReg), SrcReg1, SrcReg2)
1428           .getReg(0);
1429   MIRBuilder.buildNot(DstReg, CmpReg);
1430 
1431   MI.eraseFromParent();
1432   return true;
1433 }
1434 
1435 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
1436                                           MachineRegisterInfo &MRI,
1437                                           LegalizerHelper &Helper) const {
1438   // To allow for imported patterns to match, we ensure that the rotate amount
1439   // is 64b with an extension.
1440   Register AmtReg = MI.getOperand(2).getReg();
1441   LLT AmtTy = MRI.getType(AmtReg);
1442   (void)AmtTy;
1443   assert(AmtTy.isScalar() && "Expected a scalar rotate");
1444   assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
1445   auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
1446   Helper.Observer.changingInstr(MI);
1447   MI.getOperand(2).setReg(NewAmt.getReg(0));
1448   Helper.Observer.changedInstr(MI);
1449   return true;
1450 }
1451 
1452 bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
1453     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1454     GISelChangeObserver &Observer) const {
1455   assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
1456   // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
1457   // G_ADD_LOW instructions.
1458   // By splitting this here, we can optimize accesses in the small code model by
1459   // folding in the G_ADD_LOW into the load/store offset.
1460   auto &GlobalOp = MI.getOperand(1);
1461   // Don't modify an intrinsic call.
1462   if (GlobalOp.isSymbol())
1463     return true;
1464   const auto* GV = GlobalOp.getGlobal();
1465   if (GV->isThreadLocal())
1466     return true; // Don't want to modify TLS vars.
1467 
1468   auto &TM = ST->getTargetLowering()->getTargetMachine();
1469   unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);
1470 
1471   if (OpFlags & AArch64II::MO_GOT)
1472     return true;
1473 
1474   auto Offset = GlobalOp.getOffset();
1475   Register DstReg = MI.getOperand(0).getReg();
1476   auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
1477                   .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
1478   // Set the regclass on the dest reg too.
1479   MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1480 
1481   // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
1482   // by creating a MOVK that sets bits 48-63 of the register to (global address
1483   // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
1484   // prevent an incorrect tag being generated during relocation when the
1485   // global appears before the code section. Without the offset, a global at
1486   // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
1487   // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
1488   // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
1489   // instead of `0xf`.
1490   // This assumes that we're in the small code model so we can assume a binary
1491   // size of <= 4GB, which makes the untagged PC relative offset positive. The
1492   // binary must also be loaded into address range [0, 2^48). Both of these
1493   // properties need to be ensured at runtime when using tagged addresses.
1494   if (OpFlags & AArch64II::MO_TAGGED) {
1495     assert(!Offset &&
1496            "Should not have folded in an offset for a tagged global!");
1497     ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
1498                .addGlobalAddress(GV, 0x100000000,
1499                                  AArch64II::MO_PREL | AArch64II::MO_G3)
1500                .addImm(48);
1501     MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1502   }
1503 
1504   MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
1505       .addGlobalAddress(GV, Offset,
1506                         OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
1507   MI.eraseFromParent();
1508   return true;
1509 }
1510 
1511 bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
1512                                              MachineInstr &MI) const {
1513   Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
1514   switch (IntrinsicID) {
1515   case Intrinsic::vacopy: {
1516     unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
1517     unsigned VaListSize =
1518       (ST->isTargetDarwin() || ST->isTargetWindows())
1519           ? PtrSize
1520           : ST->isTargetILP32() ? 20 : 32;
1521 
1522     MachineFunction &MF = *MI.getMF();
1523     auto Val = MF.getRegInfo().createGenericVirtualRegister(
1524         LLT::scalar(VaListSize * 8));
1525     MachineIRBuilder MIB(MI);
1526     MIB.buildLoad(Val, MI.getOperand(2),
1527                   *MF.getMachineMemOperand(MachinePointerInfo(),
1528                                            MachineMemOperand::MOLoad,
1529                                            VaListSize, Align(PtrSize)));
1530     MIB.buildStore(Val, MI.getOperand(1),
1531                    *MF.getMachineMemOperand(MachinePointerInfo(),
1532                                             MachineMemOperand::MOStore,
1533                                             VaListSize, Align(PtrSize)));
1534     MI.eraseFromParent();
1535     return true;
1536   }
1537   case Intrinsic::get_dynamic_area_offset: {
1538     MachineIRBuilder &MIB = Helper.MIRBuilder;
1539     MIB.buildConstant(MI.getOperand(0).getReg(), 0);
1540     MI.eraseFromParent();
1541     return true;
1542   }
1543   case Intrinsic::aarch64_mops_memset_tag: {
1544     assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1545     // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
1546     // the instruction).
1547     MachineIRBuilder MIB(MI);
1548     auto &Value = MI.getOperand(3);
1549     Register ExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
1550     Value.setReg(ExtValueReg);
1551     return true;
1552   }
1553   case Intrinsic::aarch64_prefetch: {
1554     MachineIRBuilder MIB(MI);
1555     auto &AddrVal = MI.getOperand(1);
1556 
1557     int64_t IsWrite = MI.getOperand(2).getImm();
1558     int64_t Target = MI.getOperand(3).getImm();
1559     int64_t IsStream = MI.getOperand(4).getImm();
1560     int64_t IsData = MI.getOperand(5).getImm();
1561 
1562     unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
1563                      (!IsData << 3) |    // IsDataCache bit
1564                      (Target << 1) |     // Cache level bits
1565                      (unsigned)IsStream; // Stream bit
1566 
1567     MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
1568     MI.eraseFromParent();
1569     return true;
1570   }
1571   case Intrinsic::aarch64_neon_uaddv:
1572   case Intrinsic::aarch64_neon_saddv:
1573   case Intrinsic::aarch64_neon_umaxv:
1574   case Intrinsic::aarch64_neon_smaxv:
1575   case Intrinsic::aarch64_neon_uminv:
1576   case Intrinsic::aarch64_neon_sminv: {
1577     MachineIRBuilder MIB(MI);
1578     MachineRegisterInfo &MRI = *MIB.getMRI();
1579     bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv ||
1580                     IntrinsicID == Intrinsic::aarch64_neon_smaxv ||
1581                     IntrinsicID == Intrinsic::aarch64_neon_sminv;
1582 
1583     auto OldDst = MI.getOperand(0).getReg();
1584     auto OldDstTy = MRI.getType(OldDst);
1585     LLT NewDstTy = MRI.getType(MI.getOperand(2).getReg()).getElementType();
1586     if (OldDstTy == NewDstTy)
1587       return true;
1588 
1589     auto NewDst = MRI.createGenericVirtualRegister(NewDstTy);
1590 
1591     Helper.Observer.changingInstr(MI);
1592     MI.getOperand(0).setReg(NewDst);
1593     Helper.Observer.changedInstr(MI);
1594 
1595     MIB.setInsertPt(MIB.getMBB(), ++MIB.getInsertPt());
1596     MIB.buildExtOrTrunc(IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT,
1597                         OldDst, NewDst);
1598 
1599     return true;
1600   }
1601   case Intrinsic::aarch64_neon_uaddlp:
1602   case Intrinsic::aarch64_neon_saddlp: {
1603     MachineIRBuilder MIB(MI);
1604 
1605     unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
1606                        ? AArch64::G_UADDLP
1607                        : AArch64::G_SADDLP;
1608     MIB.buildInstr(Opc, {MI.getOperand(0)}, {MI.getOperand(2)});
1609     MI.eraseFromParent();
1610 
1611     return true;
1612   }
1613   case Intrinsic::aarch64_neon_uaddlv:
1614   case Intrinsic::aarch64_neon_saddlv: {
1615     MachineIRBuilder MIB(MI);
1616     MachineRegisterInfo &MRI = *MIB.getMRI();
1617 
1618     unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
1619                        ? AArch64::G_UADDLV
1620                        : AArch64::G_SADDLV;
1621     Register DstReg = MI.getOperand(0).getReg();
1622     Register SrcReg = MI.getOperand(2).getReg();
1623     LLT DstTy = MRI.getType(DstReg);
1624 
1625     LLT MidTy, ExtTy;
1626     if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) {
1627       MidTy = LLT::fixed_vector(4, 32);
1628       ExtTy = LLT::scalar(32);
1629     } else {
1630       MidTy = LLT::fixed_vector(2, 64);
1631       ExtTy = LLT::scalar(64);
1632     }
1633 
1634     Register MidReg =
1635         MIB.buildInstr(Opc, {MidTy}, {SrcReg})->getOperand(0).getReg();
1636     Register ZeroReg =
1637         MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
1638     Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy},
1639                                      {MidReg, ZeroReg})
1640                           .getReg(0);
1641 
1642     if (DstTy.getScalarSizeInBits() < 32)
1643       MIB.buildTrunc(DstReg, ExtReg);
1644     else
1645       MIB.buildCopy(DstReg, ExtReg);
1646 
1647     MI.eraseFromParent();
1648 
1649     return true;
1650   }
1651   case Intrinsic::aarch64_neon_smax:
1652   case Intrinsic::aarch64_neon_smin:
1653   case Intrinsic::aarch64_neon_umax:
1654   case Intrinsic::aarch64_neon_umin:
1655   case Intrinsic::aarch64_neon_fmax:
1656   case Intrinsic::aarch64_neon_fmin:
1657   case Intrinsic::aarch64_neon_fmaxnm:
1658   case Intrinsic::aarch64_neon_fminnm: {
1659     MachineIRBuilder MIB(MI);
1660     if (IntrinsicID == Intrinsic::aarch64_neon_smax)
1661       MIB.buildSMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
1662     else if (IntrinsicID == Intrinsic::aarch64_neon_smin)
1663       MIB.buildSMin(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
1664     else if (IntrinsicID == Intrinsic::aarch64_neon_umax)
1665       MIB.buildUMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
1666     else if (IntrinsicID == Intrinsic::aarch64_neon_umin)
1667       MIB.buildUMin(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
1668     else if (IntrinsicID == Intrinsic::aarch64_neon_fmax)
1669       MIB.buildInstr(TargetOpcode::G_FMAXIMUM, {MI.getOperand(0)},
1670                      {MI.getOperand(2), MI.getOperand(3)});
1671     else if (IntrinsicID == Intrinsic::aarch64_neon_fmin)
1672       MIB.buildInstr(TargetOpcode::G_FMINIMUM, {MI.getOperand(0)},
1673                      {MI.getOperand(2), MI.getOperand(3)});
1674     else if (IntrinsicID == Intrinsic::aarch64_neon_fmaxnm)
1675       MIB.buildInstr(TargetOpcode::G_FMAXNUM, {MI.getOperand(0)},
1676                      {MI.getOperand(2), MI.getOperand(3)});
1677     else if (IntrinsicID == Intrinsic::aarch64_neon_fminnm)
1678       MIB.buildInstr(TargetOpcode::G_FMINNUM, {MI.getOperand(0)},
1679                      {MI.getOperand(2), MI.getOperand(3)});
1680     MI.eraseFromParent();
1681     return true;
1682   }
1683   case Intrinsic::vector_reverse:
1684     // TODO: Add support for vector_reverse
1685     return false;
1686   }
1687 
1688   return true;
1689 }
1690 
1691 bool AArch64LegalizerInfo::legalizeShlAshrLshr(
1692     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1693     GISelChangeObserver &Observer) const {
1694   assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
1695          MI.getOpcode() == TargetOpcode::G_LSHR ||
1696          MI.getOpcode() == TargetOpcode::G_SHL);
1697   // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
1698   // imported patterns can select it later. Either way, it will be legal.
1699   Register AmtReg = MI.getOperand(2).getReg();
1700   auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
1701   if (!VRegAndVal)
1702     return true;
1703   // Check the shift amount is in range for an immediate form.
1704   int64_t Amount = VRegAndVal->Value.getSExtValue();
1705   if (Amount > 31)
1706     return true; // This will have to remain a register variant.
1707   auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
1708   Observer.changingInstr(MI);
1709   MI.getOperand(2).setReg(ExtCst.getReg(0));
1710   Observer.changedInstr(MI);
1711   return true;
1712 }
1713 
1714 static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
1715                                 MachineRegisterInfo &MRI) {
1716   Base = Root;
1717   Offset = 0;
1718 
1719   Register NewBase;
1720   int64_t NewOffset;
1721   if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
1722       isShiftedInt<7, 3>(NewOffset)) {
1723     Base = NewBase;
1724     Offset = NewOffset;
1725   }
1726 }
1727 
1728 // FIXME: This should be removed and replaced with the generic bitcast legalize
1729 // action.
1730 bool AArch64LegalizerInfo::legalizeLoadStore(
1731     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1732     GISelChangeObserver &Observer) const {
1733   assert(MI.getOpcode() == TargetOpcode::G_STORE ||
1734          MI.getOpcode() == TargetOpcode::G_LOAD);
1735   // Here we just try to handle vector loads/stores where our value type might
1736   // have pointer elements, which the SelectionDAG importer can't handle. To
1737   // allow the existing patterns for s64 to fire for p0, we just try to bitcast
1738   // the value to use s64 types.
1739 
1740   // Custom legalization requires the instruction, if not deleted, must be fully
1741   // legalized. In order to allow further legalization of the inst, we create
1742   // a new instruction and erase the existing one.
1743 
1744   Register ValReg = MI.getOperand(0).getReg();
1745   const LLT ValTy = MRI.getType(ValReg);
1746 
1747   if (ValTy == LLT::scalar(128)) {
1748 
1749     AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
1750     bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
1751     bool IsLoadAcquire = IsLoad && Ordering == AtomicOrdering::Acquire;
1752     bool IsStoreRelease = !IsLoad && Ordering == AtomicOrdering::Release;
1753     bool IsRcpC3 =
1754         ST->hasLSE2() && ST->hasRCPC3() && (IsLoadAcquire || IsStoreRelease);
1755 
1756     LLT s64 = LLT::scalar(64);
1757 
1758     unsigned Opcode;
1759     if (IsRcpC3) {
1760       Opcode = IsLoad ? AArch64::LDIAPPX : AArch64::STILPX;
1761     } else {
1762       // For LSE2, loads/stores should have been converted to monotonic and had
1763       // a fence inserted after them.
1764       assert(Ordering == AtomicOrdering::Monotonic ||
1765              Ordering == AtomicOrdering::Unordered);
1766       assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
1767 
1768       Opcode = IsLoad ? AArch64::LDPXi : AArch64::STPXi;
1769     }
1770 
1771     MachineInstrBuilder NewI;
1772     if (IsLoad) {
1773       NewI = MIRBuilder.buildInstr(Opcode, {s64, s64}, {});
1774       MIRBuilder.buildMergeLikeInstr(
1775           ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
1776     } else {
1777       auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
1778       NewI = MIRBuilder.buildInstr(
1779           Opcode, {}, {Split->getOperand(0), Split->getOperand(1)});
1780     }
1781 
1782     if (IsRcpC3) {
1783       NewI.addUse(MI.getOperand(1).getReg());
1784     } else {
1785       Register Base;
1786       int Offset;
1787       matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
1788       NewI.addUse(Base);
1789       NewI.addImm(Offset / 8);
1790     }
1791 
1792     NewI.cloneMemRefs(MI);
1793     constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
1794                                      *MRI.getTargetRegisterInfo(),
1795                                      *ST->getRegBankInfo());
1796     MI.eraseFromParent();
1797     return true;
1798   }
1799 
1800   if (!ValTy.isPointerVector() ||
1801       ValTy.getElementType().getAddressSpace() != 0) {
1802     LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
1803     return false;
1804   }
1805 
1806   unsigned PtrSize = ValTy.getElementType().getSizeInBits();
1807   const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
1808   auto &MMO = **MI.memoperands_begin();
1809   MMO.setType(NewTy);
1810 
1811   if (MI.getOpcode() == TargetOpcode::G_STORE) {
1812     auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
1813     MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
1814   } else {
1815     auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
1816     MIRBuilder.buildBitcast(ValReg, NewLoad);
1817   }
1818   MI.eraseFromParent();
1819   return true;
1820 }
1821 
1822 bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
1823                                          MachineRegisterInfo &MRI,
1824                                          MachineIRBuilder &MIRBuilder) const {
1825   MachineFunction &MF = MIRBuilder.getMF();
1826   Align Alignment(MI.getOperand(2).getImm());
1827   Register Dst = MI.getOperand(0).getReg();
1828   Register ListPtr = MI.getOperand(1).getReg();
1829 
1830   LLT PtrTy = MRI.getType(ListPtr);
1831   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1832 
1833   const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
1834   const Align PtrAlign = Align(PtrSize);
1835   auto List = MIRBuilder.buildLoad(
1836       PtrTy, ListPtr,
1837       *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1838                                PtrTy, PtrAlign));
1839 
1840   MachineInstrBuilder DstPtr;
1841   if (Alignment > PtrAlign) {
1842     // Realign the list to the actual required alignment.
1843     auto AlignMinus1 =
1844         MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
1845     auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
1846     DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
1847   } else
1848     DstPtr = List;
1849 
1850   LLT ValTy = MRI.getType(Dst);
1851   uint64_t ValSize = ValTy.getSizeInBits() / 8;
1852   MIRBuilder.buildLoad(
1853       Dst, DstPtr,
1854       *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1855                                ValTy, std::max(Alignment, PtrAlign)));
1856 
1857   auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));
1858 
1859   auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));
1860 
1861   MIRBuilder.buildStore(NewList, ListPtr,
1862                         *MF.getMachineMemOperand(MachinePointerInfo(),
1863                                                  MachineMemOperand::MOStore,
1864                                                  PtrTy, PtrAlign));
1865 
1866   MI.eraseFromParent();
1867   return true;
1868 }
1869 
1870 bool AArch64LegalizerInfo::legalizeBitfieldExtract(
1871     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
1872   // Only legal if we can select immediate forms.
1873   // TODO: Lower this otherwise.
1874   return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
1875          getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
1876 }
1877 
1878 bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
1879                                          MachineRegisterInfo &MRI,
1880                                          LegalizerHelper &Helper) const {
1881   // When there is no integer popcount instruction (FEAT_CSSC isn't available),
1882   // it can be more efficiently lowered to the following sequence that uses
1883   // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
1884   // registers are cheap.
1885   //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
1886   //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
1887   //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
1888   //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
1889   //
1890   // For 128 bit vector popcounts, we lower to the following sequence:
1891   //  cnt.16b   v0, v0  // v8s16, v4s32, v2s64
1892   //  uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
1893   //  uaddlp.4s v0, v0  //        v4s32, v2s64
1894   //  uaddlp.2d v0, v0  //               v2s64
1895   //
1896   // For 64 bit vector popcounts, we lower to the following sequence:
1897   //  cnt.8b    v0, v0  // v4s16, v2s32
1898   //  uaddlp.4h v0, v0  // v4s16, v2s32
1899   //  uaddlp.2s v0, v0  //        v2s32
1900 
1901   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1902   Register Dst = MI.getOperand(0).getReg();
1903   Register Val = MI.getOperand(1).getReg();
1904   LLT Ty = MRI.getType(Val);
1905   unsigned Size = Ty.getSizeInBits();
1906 
1907   assert(Ty == MRI.getType(Dst) &&
1908          "Expected src and dst to have the same type!");
1909 
1910   if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
1911     LLT s64 = LLT::scalar(64);
1912 
1913     auto Split = MIRBuilder.buildUnmerge(s64, Val);
1914     auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0));
1915     auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1));
1916     auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2);
1917 
1918     MIRBuilder.buildZExt(Dst, Add);
1919     MI.eraseFromParent();
1920     return true;
1921   }
1922 
1923   if (!ST->hasNEON() ||
1924       MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) {
1925     // Use generic lowering when custom lowering is not possible.
1926     return Ty.isScalar() && (Size == 32 || Size == 64) &&
1927            Helper.lowerBitCount(MI) ==
1928                LegalizerHelper::LegalizeResult::Legalized;
1929   }
1930 
1931   // Pre-conditioning: widen Val up to the nearest vector type.
1932   // s32,s64,v4s16,v2s32 -> v8i8
1933   // v8s16,v4s32,v2s64 -> v16i8
1934   LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
1935   if (Ty.isScalar()) {
1936     assert((Size == 32 || Size == 64 || Size == 128) && "Expected only 32, 64, or 128 bit scalars!");
1937     if (Size == 32) {
1938       Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
1939     }
1940   }
1941   Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);
1942 
1943   // Count bits in each byte-sized lane.
1944   auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);
1945 
1946   // Sum across lanes.
1947 
1948   if (ST->hasDotProd() && Ty.isVector() && Ty.getNumElements() >= 2 &&
1949       Ty.getScalarSizeInBits() != 16) {
1950     LLT Dt = Ty == LLT::fixed_vector(2, 64) ? LLT::fixed_vector(4, 32) : Ty;
1951     auto Zeros = MIRBuilder.buildConstant(Dt, 0);
1952     auto Ones = MIRBuilder.buildConstant(VTy, 1);
1953     MachineInstrBuilder Sum;
1954 
1955     if (Ty == LLT::fixed_vector(2, 64)) {
1956       auto UDOT =
1957           MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
1958       Sum = MIRBuilder.buildInstr(AArch64::G_UADDLP, {Ty}, {UDOT});
1959     } else if (Ty == LLT::fixed_vector(4, 32)) {
1960       Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
1961     } else if (Ty == LLT::fixed_vector(2, 32)) {
1962       Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
1963     } else {
1964       llvm_unreachable("unexpected vector shape");
1965     }
1966 
1967     Sum->getOperand(0).setReg(Dst);
1968     MI.eraseFromParent();
1969     return true;
1970   }
1971 
1972   Register HSum = CTPOP.getReg(0);
1973   unsigned Opc;
1974   SmallVector<LLT> HAddTys;
1975   if (Ty.isScalar()) {
1976     Opc = Intrinsic::aarch64_neon_uaddlv;
1977     HAddTys.push_back(LLT::scalar(32));
1978   } else if (Ty == LLT::fixed_vector(8, 16)) {
1979     Opc = Intrinsic::aarch64_neon_uaddlp;
1980     HAddTys.push_back(LLT::fixed_vector(8, 16));
1981   } else if (Ty == LLT::fixed_vector(4, 32)) {
1982     Opc = Intrinsic::aarch64_neon_uaddlp;
1983     HAddTys.push_back(LLT::fixed_vector(8, 16));
1984     HAddTys.push_back(LLT::fixed_vector(4, 32));
1985   } else if (Ty == LLT::fixed_vector(2, 64)) {
1986     Opc = Intrinsic::aarch64_neon_uaddlp;
1987     HAddTys.push_back(LLT::fixed_vector(8, 16));
1988     HAddTys.push_back(LLT::fixed_vector(4, 32));
1989     HAddTys.push_back(LLT::fixed_vector(2, 64));
1990   } else if (Ty == LLT::fixed_vector(4, 16)) {
1991     Opc = Intrinsic::aarch64_neon_uaddlp;
1992     HAddTys.push_back(LLT::fixed_vector(4, 16));
1993   } else if (Ty == LLT::fixed_vector(2, 32)) {
1994     Opc = Intrinsic::aarch64_neon_uaddlp;
1995     HAddTys.push_back(LLT::fixed_vector(4, 16));
1996     HAddTys.push_back(LLT::fixed_vector(2, 32));
1997   } else
1998     llvm_unreachable("unexpected vector shape");
1999   MachineInstrBuilder UADD;
2000   for (LLT HTy : HAddTys) {
2001     UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}).addUse(HSum);
2002     HSum = UADD.getReg(0);
2003   }
2004 
2005   // Post-conditioning.
2006   if (Ty.isScalar() && (Size == 64 || Size == 128))
2007     MIRBuilder.buildZExt(Dst, UADD);
2008   else
2009     UADD->getOperand(0).setReg(Dst);
2010   MI.eraseFromParent();
2011   return true;
2012 }
2013 
2014 bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
2015     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2016   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2017   LLT s64 = LLT::scalar(64);
2018   auto Addr = MI.getOperand(1).getReg();
2019   auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
2020   auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
2021   auto DstLo = MRI.createGenericVirtualRegister(s64);
2022   auto DstHi = MRI.createGenericVirtualRegister(s64);
2023 
2024   MachineInstrBuilder CAS;
2025   if (ST->hasLSE()) {
2026     // We have 128-bit CASP instructions taking XSeqPair registers, which are
2027     // s128. We need the merge/unmerge to bracket the expansion and pair up with
2028     // the rest of the MIR so we must reassemble the extracted registers into a
2029     // 128-bit known-regclass one with code like this:
2030     //
2031     //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
2032     //     %out = CASP %in1, ...
2033     //     %OldLo = G_EXTRACT %out, 0
2034     //     %OldHi = G_EXTRACT %out, 64
2035     auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
2036     unsigned Opcode;
2037     switch (Ordering) {
2038     case AtomicOrdering::Acquire:
2039       Opcode = AArch64::CASPAX;
2040       break;
2041     case AtomicOrdering::Release:
2042       Opcode = AArch64::CASPLX;
2043       break;
2044     case AtomicOrdering::AcquireRelease:
2045     case AtomicOrdering::SequentiallyConsistent:
2046       Opcode = AArch64::CASPALX;
2047       break;
2048     default:
2049       Opcode = AArch64::CASPX;
2050       break;
2051     }
2052 
2053     LLT s128 = LLT::scalar(128);
2054     auto CASDst = MRI.createGenericVirtualRegister(s128);
2055     auto CASDesired = MRI.createGenericVirtualRegister(s128);
2056     auto CASNew = MRI.createGenericVirtualRegister(s128);
2057     MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
2058         .addUse(DesiredI->getOperand(0).getReg())
2059         .addImm(AArch64::sube64)
2060         .addUse(DesiredI->getOperand(1).getReg())
2061         .addImm(AArch64::subo64);
2062     MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
2063         .addUse(NewI->getOperand(0).getReg())
2064         .addImm(AArch64::sube64)
2065         .addUse(NewI->getOperand(1).getReg())
2066         .addImm(AArch64::subo64);
2067 
2068     CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});
2069 
2070     MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
2071     MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
2072   } else {
2073     // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
2074     // can take arbitrary registers so it just has the normal GPR64 operands the
2075     // rest of AArch64 is expecting.
2076     auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
2077     unsigned Opcode;
2078     switch (Ordering) {
2079     case AtomicOrdering::Acquire:
2080       Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
2081       break;
2082     case AtomicOrdering::Release:
2083       Opcode = AArch64::CMP_SWAP_128_RELEASE;
2084       break;
2085     case AtomicOrdering::AcquireRelease:
2086     case AtomicOrdering::SequentiallyConsistent:
2087       Opcode = AArch64::CMP_SWAP_128;
2088       break;
2089     default:
2090       Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
2091       break;
2092     }
2093 
2094     auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2095     CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
2096                                 {Addr, DesiredI->getOperand(0),
2097                                  DesiredI->getOperand(1), NewI->getOperand(0),
2098                                  NewI->getOperand(1)});
2099   }
2100 
2101   CAS.cloneMemRefs(MI);
2102   constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
2103                                    *MRI.getTargetRegisterInfo(),
2104                                    *ST->getRegBankInfo());
2105 
2106   MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi});
2107   MI.eraseFromParent();
2108   return true;
2109 }
2110 
2111 bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
2112                                         LegalizerHelper &Helper) const {
2113   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2114   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
2115   LLT Ty = MRI.getType(MI.getOperand(1).getReg());
2116   auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
2117   MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
2118   MI.eraseFromParent();
2119   return true;
2120 }
2121 
2122 bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
2123                                           LegalizerHelper &Helper) const {
2124   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2125 
2126   // Tagged version MOPSMemorySetTagged is legalised in legalizeIntrinsic
2127   if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
2128     // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
2129     // the instruction).
2130     auto &Value = MI.getOperand(1);
2131     Register ExtValueReg =
2132         MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
2133     Value.setReg(ExtValueReg);
2134     return true;
2135   }
2136 
2137   return false;
2138 }
2139 
2140 bool AArch64LegalizerInfo::legalizeExtractVectorElt(
2141     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2142   assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT);
2143   auto VRegAndVal =
2144       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2145   if (VRegAndVal)
2146     return true;
2147   return Helper.lowerExtractInsertVectorElt(MI) !=
2148          LegalizerHelper::LegalizeResult::UnableToLegalize;
2149 }
2150 
2151 bool AArch64LegalizerInfo::legalizeDynStackAlloc(
2152     MachineInstr &MI, LegalizerHelper &Helper) const {
2153   MachineFunction &MF = *MI.getParent()->getParent();
2154   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2155   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
2156 
2157   // If stack probing is not enabled for this function, use the default
2158   // lowering.
2159   if (!MF.getFunction().hasFnAttribute("probe-stack") ||
2160       MF.getFunction().getFnAttribute("probe-stack").getValueAsString() !=
2161           "inline-asm") {
2162     Helper.lowerDynStackAlloc(MI);
2163     return true;
2164   }
2165 
2166   Register Dst = MI.getOperand(0).getReg();
2167   Register AllocSize = MI.getOperand(1).getReg();
2168   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
2169 
2170   assert(MRI.getType(Dst) == LLT::pointer(0, 64) &&
2171          "Unexpected type for dynamic alloca");
2172   assert(MRI.getType(AllocSize) == LLT::scalar(64) &&
2173          "Unexpected type for dynamic alloca");
2174 
2175   LLT PtrTy = MRI.getType(Dst);
2176   Register SPReg =
2177       Helper.getTargetLowering().getStackPointerRegisterToSaveRestore();
2178   Register SPTmp =
2179       Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
2180   auto NewMI =
2181       MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp});
2182   MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass);
2183   MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI);
2184   MIRBuilder.buildCopy(Dst, SPTmp);
2185 
2186   MI.eraseFromParent();
2187   return true;
2188 }
2189 
2190 bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
2191                                             LegalizerHelper &Helper) const {
2192   MachineIRBuilder &MIB = Helper.MIRBuilder;
2193   auto &AddrVal = MI.getOperand(0);
2194 
2195   int64_t IsWrite = MI.getOperand(1).getImm();
2196   int64_t Locality = MI.getOperand(2).getImm();
2197   int64_t IsData = MI.getOperand(3).getImm();
2198 
2199   bool IsStream = Locality == 0;
2200   if (Locality != 0) {
2201     assert(Locality <= 3 && "Prefetch locality out-of-range");
2202     // The locality degree is the opposite of the cache speed.
2203     // Put the number the other way around.
2204     // The encoding starts at 0 for level 1
2205     Locality = 3 - Locality;
2206   }
2207 
2208   unsigned PrfOp = (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;
2209 
2210   MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
2211   MI.eraseFromParent();
2212   return true;
2213 }
2214