xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (revision 258a0d760aa8b42899a000e30f610f900a402556)
1 //===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64LegalizerInfo.h"
15 #include "AArch64RegisterBankInfo.h"
16 #include "AArch64Subtarget.h"
17 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
19 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
20 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
21 #include "llvm/CodeGen/GlobalISel/Utils.h"
22 #include "llvm/CodeGen/MachineInstr.h"
23 #include "llvm/CodeGen/MachineRegisterInfo.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/Intrinsics.h"
28 #include "llvm/IR/IntrinsicsAArch64.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/MathExtras.h"
31 #include <initializer_list>
32 
33 #define DEBUG_TYPE "aarch64-legalinfo"
34 
35 using namespace llvm;
36 using namespace LegalizeActions;
37 using namespace LegalizeMutations;
38 using namespace LegalityPredicates;
39 using namespace MIPatternMatch;
40 
41 AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
42     : ST(&ST) {
43   using namespace TargetOpcode;
44   const LLT p0 = LLT::pointer(0, 64);
45   const LLT s8 = LLT::scalar(8);
46   const LLT s16 = LLT::scalar(16);
47   const LLT s32 = LLT::scalar(32);
48   const LLT s64 = LLT::scalar(64);
49   const LLT s128 = LLT::scalar(128);
50   const LLT v16s8 = LLT::fixed_vector(16, 8);
51   const LLT v8s8 = LLT::fixed_vector(8, 8);
52   const LLT v4s8 = LLT::fixed_vector(4, 8);
53   const LLT v8s16 = LLT::fixed_vector(8, 16);
54   const LLT v4s16 = LLT::fixed_vector(4, 16);
55   const LLT v2s16 = LLT::fixed_vector(2, 16);
56   const LLT v2s32 = LLT::fixed_vector(2, 32);
57   const LLT v4s32 = LLT::fixed_vector(4, 32);
58   const LLT v2s64 = LLT::fixed_vector(2, 64);
59   const LLT v2p0 = LLT::fixed_vector(2, p0);
60 
61   std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
62                                                         v16s8, v8s16, v4s32,
63                                                         v2s64, v2p0,
64                                                         /* End 128bit types */
65                                                         /* Begin 64bit types */
66                                                         v8s8, v4s16, v2s32};
67 
68   const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
69 
70   // FIXME: support subtargets which have neon/fp-armv8 disabled.
71   if (!ST.hasNEON() || !ST.hasFPARMv8()) {
72     getLegacyLegalizerInfo().computeTables();
73     return;
74   }
75 
76   // Some instructions only support s16 if the subtarget has full 16-bit FP
77   // support.
78   const bool HasFP16 = ST.hasFullFP16();
79   const LLT &MinFPScalar = HasFP16 ? s16 : s32;
80 
81   const bool HasCSSC = ST.hasCSSC();
82 
83   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
84       .legalFor({p0, s8, s16, s32, s64})
85       .legalFor(PackedVectorAllTypeList)
86       .widenScalarToNextPow2(0)
87       .clampScalar(0, s8, s64)
88       .fewerElementsIf(
89           [=](const LegalityQuery &Query) {
90             return Query.Types[0].isVector() &&
91                    (Query.Types[0].getElementType() != s64 ||
92                     Query.Types[0].getNumElements() != 2);
93           },
94           [=](const LegalityQuery &Query) {
95             LLT EltTy = Query.Types[0].getElementType();
96             if (EltTy == s64)
97               return std::make_pair(0, LLT::fixed_vector(2, 64));
98             return std::make_pair(0, EltTy);
99           });
100 
101   getActionDefinitionsBuilder(G_PHI)
102       .legalFor({p0, s16, s32, s64})
103       .legalFor(PackedVectorAllTypeList)
104       .widenScalarToNextPow2(0)
105       .clampScalar(0, s16, s64)
106       // Maximum: sN * k = 128
107       .clampMaxNumElements(0, s8, 16)
108       .clampMaxNumElements(0, s16, 8)
109       .clampMaxNumElements(0, s32, 4)
110       .clampMaxNumElements(0, s64, 2)
111       .clampMaxNumElements(0, p0, 2);
112 
113   getActionDefinitionsBuilder(G_BSWAP)
114       .legalFor({s32, s64, v4s32, v2s32, v2s64})
115       .widenScalarToNextPow2(0)
116       .clampScalar(0, s32, s64);
117 
118   getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
119       .legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8})
120       .scalarizeIf(
121           [=](const LegalityQuery &Query) {
122             return Query.Opcode == G_MUL && Query.Types[0] == v2s64;
123           },
124           0)
125       .legalFor({v2s64})
126       .widenScalarToNextPow2(0)
127       .clampScalar(0, s32, s64)
128       .clampNumElements(0, v2s32, v4s32)
129       .clampNumElements(0, v2s64, v2s64)
130       .moreElementsToNextPow2(0);
131 
132   getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
133       .customIf([=](const LegalityQuery &Query) {
134         const auto &SrcTy = Query.Types[0];
135         const auto &AmtTy = Query.Types[1];
136         return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
137                AmtTy.getSizeInBits() == 32;
138       })
139       .legalFor({
140           {s32, s32},
141           {s32, s64},
142           {s64, s64},
143           {v8s8, v8s8},
144           {v16s8, v16s8},
145           {v4s16, v4s16},
146           {v8s16, v8s16},
147           {v2s32, v2s32},
148           {v4s32, v4s32},
149           {v2s64, v2s64},
150       })
151       .widenScalarToNextPow2(0)
152       .clampScalar(1, s32, s64)
153       .clampScalar(0, s32, s64)
154       .clampNumElements(0, v2s32, v4s32)
155       .clampNumElements(0, v2s64, v2s64)
156       .moreElementsToNextPow2(0)
157       .minScalarSameAs(1, 0);
158 
159   getActionDefinitionsBuilder(G_PTR_ADD)
160       .legalFor({{p0, s64}, {v2p0, v2s64}})
161       .clampScalar(1, s64, s64);
162 
163   getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
164 
165   getActionDefinitionsBuilder({G_SDIV, G_UDIV})
166       .legalFor({s32, s64})
167       .libcallFor({s128})
168       .clampScalar(0, s32, s64)
169       .widenScalarToNextPow2(0)
170       .scalarize(0);
171 
172   getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
173       .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32})
174       .widenScalarOrEltToNextPow2(0)
175       .clampScalarOrElt(0, s32, s64)
176       .clampNumElements(0, v2s32, v4s32)
177       .clampNumElements(0, v2s64, v2s64)
178       .moreElementsToNextPow2(0);
179 
180 
181   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
182       .widenScalarToNextPow2(0, /*Min = */ 32)
183       .clampScalar(0, s32, s64)
184       .lower();
185 
186   getActionDefinitionsBuilder({G_SMULH, G_UMULH})
187       .legalFor({s64, v8s16, v16s8, v4s32})
188       .lower();
189 
190   auto &MinMaxActions = getActionDefinitionsBuilder(
191       {G_SMIN, G_SMAX, G_UMIN, G_UMAX});
192   if (HasCSSC)
193     MinMaxActions
194         .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
195         // Making clamping conditional on CSSC extension as without legal types we
196         // lower to CMP which can fold one of the two sxtb's we'd otherwise need
197         // if we detect a type smaller than 32-bit.
198         .minScalar(0, s32);
199   else
200     MinMaxActions
201         .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32});
202   MinMaxActions
203       .clampNumElements(0, v8s8, v16s8)
204       .clampNumElements(0, v4s16, v8s16)
205       .clampNumElements(0, v2s32, v4s32)
206       // FIXME: This shouldn't be needed as v2s64 types are going to
207       // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
208       .clampNumElements(0, v2s64, v2s64)
209       .lower();
210 
211   getActionDefinitionsBuilder(
212       {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
213       .legalFor({{s32, s32}, {s64, s32}})
214       .clampScalar(0, s32, s64)
215        .clampScalar(1, s32, s64)
216       .widenScalarToNextPow2(0);
217 
218   getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG})
219       .legalFor({MinFPScalar, s32, s64, v2s64, v4s32, v2s32})
220       .clampScalar(0, MinFPScalar, s64)
221       .clampNumElements(0, v2s32, v4s32)
222       .clampNumElements(0, v2s64, v2s64);
223 
224   getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64});
225 
226   getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT,
227                                G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND,
228                                G_FNEARBYINT, G_INTRINSIC_LRINT})
229       // If we don't have full FP16 support, then scalarize the elements of
230       // vectors containing fp16 types.
231       .fewerElementsIf(
232           [=, &ST](const LegalityQuery &Query) {
233             const auto &Ty = Query.Types[0];
234             return Ty.isVector() && Ty.getElementType() == s16 &&
235                    !ST.hasFullFP16();
236           },
237           [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
238       // If we don't have full FP16 support, then widen s16 to s32 if we
239       // encounter it.
240       .widenScalarIf(
241           [=, &ST](const LegalityQuery &Query) {
242             return Query.Types[0] == s16 && !ST.hasFullFP16();
243           },
244           [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
245       .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16});
246 
247   getActionDefinitionsBuilder(
248       {G_FCOS, G_FSIN, G_FLOG10, G_FLOG, G_FLOG2, G_FEXP, G_FEXP2, G_FPOW})
249       // We need a call for these, so we always need to scalarize.
250       .scalarize(0)
251       // Regardless of FP16 support, widen 16-bit elements to 32-bits.
252       .minScalar(0, s32)
253       .libcallFor({s32, s64, v2s32, v4s32, v2s64});
254 
255   getActionDefinitionsBuilder(G_INSERT)
256       .legalIf(all(typeInSet(0, {s32, s64, p0}),
257                    typeInSet(1, {s8, s16, s32}), smallerThan(1, 0)))
258       .widenScalarToNextPow2(0)
259       .clampScalar(0, s32, s64)
260       .widenScalarToNextPow2(1)
261       .minScalar(1, s8)
262       .maxScalarIf(typeInSet(0, {s32}), 1, s16)
263       .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);
264 
265   getActionDefinitionsBuilder(G_EXTRACT)
266       .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
267                    typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
268       .widenScalarToNextPow2(1)
269       .clampScalar(1, s32, s128)
270       .widenScalarToNextPow2(0)
271       .minScalar(0, s16)
272       .maxScalarIf(typeInSet(1, {s32}), 0, s16)
273       .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
274       .maxScalarIf(typeInSet(1, {s128}), 0, s64);
275 
276 
277   for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
278     auto &Actions =  getActionDefinitionsBuilder(Op);
279 
280     if (Op == G_SEXTLOAD)
281       Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));
282 
283     // Atomics have zero extending behavior.
284     Actions
285       .legalForTypesWithMemDesc({{s32, p0, s8, 8},
286                                  {s32, p0, s16, 8},
287                                  {s32, p0, s32, 8},
288                                  {s64, p0, s8, 2},
289                                  {s64, p0, s16, 2},
290                                  {s64, p0, s32, 4},
291                                  {s64, p0, s64, 8},
292                                  {p0, p0, s64, 8},
293                                  {v2s32, p0, s64, 8}})
294       .widenScalarToNextPow2(0)
295       .clampScalar(0, s32, s64)
296       // TODO: We could support sum-of-pow2's but the lowering code doesn't know
297       //       how to do that yet.
298       .unsupportedIfMemSizeNotPow2()
299       // Lower anything left over into G_*EXT and G_LOAD
300       .lower();
301   }
302 
303   auto IsPtrVecPred = [=](const LegalityQuery &Query) {
304     const LLT &ValTy = Query.Types[0];
305     if (!ValTy.isVector())
306       return false;
307     const LLT EltTy = ValTy.getElementType();
308     return EltTy.isPointer() && EltTy.getAddressSpace() == 0;
309   };
310 
311   getActionDefinitionsBuilder(G_LOAD)
312       .customIf([=](const LegalityQuery &Query) {
313         return Query.Types[0] == s128 &&
314                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
315       })
316       .legalForTypesWithMemDesc({{s8, p0, s8, 8},
317                                  {s16, p0, s16, 8},
318                                  {s32, p0, s32, 8},
319                                  {s64, p0, s64, 8},
320                                  {p0, p0, s64, 8},
321                                  {s128, p0, s128, 8},
322                                  {v8s8, p0, s64, 8},
323                                  {v16s8, p0, s128, 8},
324                                  {v4s16, p0, s64, 8},
325                                  {v8s16, p0, s128, 8},
326                                  {v2s32, p0, s64, 8},
327                                  {v4s32, p0, s128, 8},
328                                  {v2s64, p0, s128, 8}})
329       // These extends are also legal
330       .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}})
331       .widenScalarToNextPow2(0, /* MinSize = */8)
332       .lowerIfMemSizeNotByteSizePow2()
333       .clampScalar(0, s8, s64)
334       .narrowScalarIf([=](const LegalityQuery &Query) {
335         // Clamp extending load results to 32-bits.
336         return Query.Types[0].isScalar() &&
337           Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
338           Query.Types[0].getSizeInBits() > 32;
339         },
340         changeTo(0, s32))
341       .clampMaxNumElements(0, s8, 16)
342       .clampMaxNumElements(0, s16, 8)
343       .clampMaxNumElements(0, s32, 4)
344       .clampMaxNumElements(0, s64, 2)
345       .clampMaxNumElements(0, p0, 2)
346       .customIf(IsPtrVecPred)
347       .scalarizeIf(typeIs(0, v2s16), 0);
348 
349   getActionDefinitionsBuilder(G_STORE)
350       .customIf([=](const LegalityQuery &Query) {
351         return Query.Types[0] == s128 &&
352                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
353       })
354       .legalForTypesWithMemDesc({{s8, p0, s8, 8},
355                                  {s16, p0, s8, 8}, // truncstorei8 from s16
356                                  {s32, p0, s8, 8}, // truncstorei8 from s32
357                                  {s64, p0, s8, 8}, // truncstorei8 from s64
358                                  {s16, p0, s16, 8},
359                                  {s32, p0, s16, 8}, // truncstorei16 from s32
360                                  {s64, p0, s16, 8}, // truncstorei16 from s64
361                                  {s32, p0, s8, 8},
362                                  {s32, p0, s16, 8},
363                                  {s32, p0, s32, 8},
364                                  {s64, p0, s64, 8},
365                                  {s64, p0, s32, 8}, // truncstorei32 from s64
366                                  {p0, p0, s64, 8},
367                                  {s128, p0, s128, 8},
368                                  {v16s8, p0, s128, 8},
369                                  {v8s8, p0, s64, 8},
370                                  {v4s16, p0, s64, 8},
371                                  {v8s16, p0, s128, 8},
372                                  {v2s32, p0, s64, 8},
373                                  {v4s32, p0, s128, 8},
374                                  {v2s64, p0, s128, 8}})
375       .clampScalar(0, s8, s64)
376       .lowerIf([=](const LegalityQuery &Query) {
377         return Query.Types[0].isScalar() &&
378                Query.Types[0] != Query.MMODescrs[0].MemoryTy;
379       })
380       // Maximum: sN * k = 128
381       .clampMaxNumElements(0, s8, 16)
382       .clampMaxNumElements(0, s16, 8)
383       .clampMaxNumElements(0, s32, 4)
384       .clampMaxNumElements(0, s64, 2)
385       .clampMaxNumElements(0, p0, 2)
386       .lowerIfMemSizeNotPow2()
387       .customIf(IsPtrVecPred)
388       .scalarizeIf(typeIs(0, v2s16), 0);
389 
390   // Constants
391   getActionDefinitionsBuilder(G_CONSTANT)
392       .legalFor({p0, s8, s16, s32, s64})
393       .widenScalarToNextPow2(0)
394       .clampScalar(0, s8, s64);
395   getActionDefinitionsBuilder(G_FCONSTANT)
396       .legalIf([=](const LegalityQuery &Query) {
397         const auto &Ty = Query.Types[0];
398         if (HasFP16 && Ty == s16)
399           return true;
400         return Ty == s32 || Ty == s64 || Ty == s128;
401       })
402       .clampScalar(0, MinFPScalar, s128);
403 
404   getActionDefinitionsBuilder({G_ICMP, G_FCMP})
405       .legalFor({{s32, s32},
406                  {s32, s64},
407                  {s32, p0},
408                  {v4s32, v4s32},
409                  {v2s32, v2s32},
410                  {v2s64, v2s64},
411                  {v2s64, v2p0},
412                  {v4s16, v4s16},
413                  {v8s16, v8s16},
414                  {v8s8, v8s8},
415                  {v16s8, v16s8}})
416       .widenScalarOrEltToNextPow2(1)
417       .clampScalar(1, s32, s64)
418       .clampScalar(0, s32, s32)
419       .minScalarEltSameAsIf(
420           [=](const LegalityQuery &Query) {
421             const LLT &Ty = Query.Types[0];
422             const LLT &SrcTy = Query.Types[1];
423             return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
424                    Ty.getElementType() != SrcTy.getElementType();
425           },
426           0, 1)
427       .minScalarOrEltIf(
428           [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
429           1, s32)
430       .minScalarOrEltIf(
431           [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
432           s64)
433       .clampNumElements(0, v2s32, v4s32);
434 
435   // Extensions
436   auto ExtLegalFunc = [=](const LegalityQuery &Query) {
437     unsigned DstSize = Query.Types[0].getSizeInBits();
438 
439     if (DstSize == 128 && !Query.Types[0].isVector())
440       return false; // Extending to a scalar s128 needs narrowing.
441 
442     // Make sure that we have something that will fit in a register, and
443     // make sure it's a power of 2.
444     if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize))
445       return false;
446 
447     const LLT &SrcTy = Query.Types[1];
448 
449     // Make sure we fit in a register otherwise. Don't bother checking that
450     // the source type is below 128 bits. We shouldn't be allowing anything
451     // through which is wider than the destination in the first place.
452     unsigned SrcSize = SrcTy.getSizeInBits();
453     if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
454       return false;
455 
456     return true;
457   };
458   getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
459       .legalIf(ExtLegalFunc)
460       .clampScalar(0, s64, s64); // Just for s128, others are handled above.
461 
462   getActionDefinitionsBuilder(G_TRUNC)
463       .minScalarOrEltIf(
464           [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
465           0, s8)
466       .customIf([=](const LegalityQuery &Query) {
467         LLT DstTy = Query.Types[0];
468         LLT SrcTy = Query.Types[1];
469         return DstTy == v8s8 && SrcTy.getSizeInBits() > 128;
470       })
471       .alwaysLegal();
472 
473   getActionDefinitionsBuilder(G_SEXT_INREG)
474       .legalFor({s32, s64})
475       .legalFor(PackedVectorAllTypeList)
476       .lower();
477 
478   // FP conversions
479   getActionDefinitionsBuilder(G_FPTRUNC)
480       .legalFor(
481           {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
482       .clampMaxNumElements(0, s32, 2);
483   getActionDefinitionsBuilder(G_FPEXT)
484       .legalFor(
485           {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
486       .clampMaxNumElements(0, s64, 2);
487 
488   // Conversions
489   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
490       .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
491       .widenScalarToNextPow2(0)
492       .clampScalar(0, s32, s64)
493       .widenScalarToNextPow2(1)
494       .clampScalar(1, s32, s64);
495 
496   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
497       .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
498       .clampScalar(1, s32, s64)
499       .minScalarSameAs(1, 0)
500       .clampScalar(0, s32, s64)
501       .widenScalarToNextPow2(0);
502 
503   // Control-flow
504   getActionDefinitionsBuilder(G_BRCOND)
505     .legalFor({s32})
506     .clampScalar(0, s32, s32);
507   getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
508 
509   getActionDefinitionsBuilder(G_SELECT)
510       .legalFor({{s32, s32}, {s64, s32}, {p0, s32}})
511       .widenScalarToNextPow2(0)
512       .clampScalar(0, s32, s64)
513       .clampScalar(1, s32, s32)
514       .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
515       .lowerIf(isVector(0));
516 
517   // Pointer-handling
518   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
519 
520   if (TM.getCodeModel() == CodeModel::Small)
521     getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
522   else
523     getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
524 
525   getActionDefinitionsBuilder(G_PTRTOINT)
526       .legalFor({{s64, p0}, {v2s64, v2p0}})
527       .widenScalarToNextPow2(0, 64)
528       .clampScalar(0, s64, s64);
529 
530   getActionDefinitionsBuilder(G_INTTOPTR)
531       .unsupportedIf([&](const LegalityQuery &Query) {
532         return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
533       })
534       .legalFor({{p0, s64}, {v2p0, v2s64}});
535 
536   // Casts for 32 and 64-bit width type are just copies.
537   // Same for 128-bit width type, except they are on the FPR bank.
538   getActionDefinitionsBuilder(G_BITCAST)
539       // FIXME: This is wrong since G_BITCAST is not allowed to change the
540       // number of bits but it's what the previous code described and fixing
541       // it breaks tests.
542       .legalForCartesianProduct({s8, s16, s32, s64, s128, v16s8, v8s8, v4s8,
543                                  v8s16, v4s16, v2s16, v4s32, v2s32, v2s64,
544                                  v2p0});
545 
546   getActionDefinitionsBuilder(G_VASTART).legalFor({p0});
547 
548   // va_list must be a pointer, but most sized types are pretty easy to handle
549   // as the destination.
550   getActionDefinitionsBuilder(G_VAARG)
551       .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
552       .clampScalar(0, s8, s64)
553       .widenScalarToNextPow2(0, /*Min*/ 8);
554 
555   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
556       .lowerIf(
557           all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
558 
559   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
560       .customIf([](const LegalityQuery &Query) {
561         return Query.Types[0].getSizeInBits() == 128;
562       })
563       .clampScalar(0, s32, s64)
564       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
565 
566   getActionDefinitionsBuilder(
567       {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
568        G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
569        G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
570       .clampScalar(0, s32, s64)
571       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
572 
573   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});
574 
575   // Merge/Unmerge
576   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
577     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
578     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
579     getActionDefinitionsBuilder(Op)
580         .widenScalarToNextPow2(LitTyIdx, 8)
581         .widenScalarToNextPow2(BigTyIdx, 32)
582         .clampScalar(LitTyIdx, s8, s64)
583         .clampScalar(BigTyIdx, s32, s128)
584         .legalIf([=](const LegalityQuery &Q) {
585           switch (Q.Types[BigTyIdx].getSizeInBits()) {
586           case 32:
587           case 64:
588           case 128:
589             break;
590           default:
591             return false;
592           }
593           switch (Q.Types[LitTyIdx].getSizeInBits()) {
594           case 8:
595           case 16:
596           case 32:
597           case 64:
598             return true;
599           default:
600             return false;
601           }
602         });
603   }
604 
605   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
606       .unsupportedIf([=](const LegalityQuery &Query) {
607         const LLT &EltTy = Query.Types[1].getElementType();
608         return Query.Types[0] != EltTy;
609       })
610       .minScalar(2, s64)
611       .legalIf([=](const LegalityQuery &Query) {
612         const LLT &VecTy = Query.Types[1];
613         return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
614                VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
615                VecTy == v8s8 || VecTy == v16s8 || VecTy == v2s32 ||
616                VecTy == v2p0;
617       })
618       .minScalarOrEltIf(
619           [=](const LegalityQuery &Query) {
620             // We want to promote <M x s1> to <M x s64> if that wouldn't
621             // cause the total vec size to be > 128b.
622             return Query.Types[1].getNumElements() <= 2;
623           },
624           0, s64)
625       .minScalarOrEltIf(
626           [=](const LegalityQuery &Query) {
627             return Query.Types[1].getNumElements() <= 4;
628           },
629           0, s32)
630       .minScalarOrEltIf(
631           [=](const LegalityQuery &Query) {
632             return Query.Types[1].getNumElements() <= 8;
633           },
634           0, s16)
635       .minScalarOrEltIf(
636           [=](const LegalityQuery &Query) {
637             return Query.Types[1].getNumElements() <= 16;
638           },
639           0, s8)
640       .minScalarOrElt(0, s8) // Worst case, we need at least s8.
641       .clampMaxNumElements(1, s64, 2)
642       .clampMaxNumElements(1, s32, 4)
643       .clampMaxNumElements(1, s16, 8)
644       .clampMaxNumElements(1, p0, 2);
645 
646   getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
647       .legalIf(typeInSet(0, {v8s16, v2s32, v4s32, v2s64}));
648 
649   getActionDefinitionsBuilder(G_BUILD_VECTOR)
650       .legalFor({{v8s8, s8},
651                  {v16s8, s8},
652                  {v2s16, s16},
653                  {v4s16, s16},
654                  {v8s16, s16},
655                  {v2s32, s32},
656                  {v4s32, s32},
657                  {v2p0, p0},
658                  {v2s64, s64}})
659       .clampNumElements(0, v4s32, v4s32)
660       .clampNumElements(0, v2s64, v2s64)
661       .minScalarOrElt(0, s8)
662       .minScalarSameAs(1, 0);
663 
664   getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
665 
666   getActionDefinitionsBuilder(G_CTLZ)
667       .legalForCartesianProduct(
668           {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
669       .scalarize(1);
670   getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();
671 
672   // TODO: Custom lowering for v2s32, v4s32, v2s64.
673   getActionDefinitionsBuilder(G_BITREVERSE)
674       .legalFor({s32, s64, v8s8, v16s8})
675       .widenScalarToNextPow2(0, /*Min = */ 32)
676       .clampScalar(0, s32, s64);
677 
678   getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();
679 
680   getActionDefinitionsBuilder(G_CTTZ)
681       .lowerIf(isVector(0))
682       .clampScalar(0, s32, s64)
683       .scalarSameSizeAs(1, 0)
684       .legalIf([=](const LegalityQuery &Query) {
685         return (HasCSSC && typeInSet(0, {s32, s64})(Query));
686       })
687       .customIf([=](const LegalityQuery &Query) {
688         return (!HasCSSC && typeInSet(0, {s32, s64})(Query));
689       });
690 
691   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
692       .legalIf([=](const LegalityQuery &Query) {
693         const LLT &DstTy = Query.Types[0];
694         const LLT &SrcTy = Query.Types[1];
695         // For now just support the TBL2 variant which needs the source vectors
696         // to be the same size as the dest.
697         if (DstTy != SrcTy)
698           return false;
699         return llvm::is_contained({v2s32, v4s32, v2s64, v2p0, v16s8, v8s16},
700                                   DstTy);
701       })
702       // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we
703       // just want those lowered into G_BUILD_VECTOR
704       .lowerIf([=](const LegalityQuery &Query) {
705         return !Query.Types[1].isVector();
706       })
707       .moreElementsIf(
708           [](const LegalityQuery &Query) {
709             return Query.Types[0].isVector() && Query.Types[1].isVector() &&
710                    Query.Types[0].getNumElements() >
711                        Query.Types[1].getNumElements();
712           },
713           changeTo(1, 0))
714       .moreElementsToNextPow2(0)
715       .clampNumElements(0, v4s32, v4s32)
716       .clampNumElements(0, v2s64, v2s64);
717 
718   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
719       .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}});
720 
721   getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({{p0}, {s64}});
722 
723   getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) {
724     return Query.Types[0] == p0 && Query.Types[1] == s64;
725   });
726 
727   getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower();
728 
729   if (ST.hasMOPS()) {
730     // G_BZERO is not supported. Currently it is only emitted by
731     // PreLegalizerCombiner for G_MEMSET with zero constant.
732     getActionDefinitionsBuilder(G_BZERO).unsupported();
733 
734     getActionDefinitionsBuilder(G_MEMSET)
735         .legalForCartesianProduct({p0}, {s64}, {s64})
736         .customForCartesianProduct({p0}, {s8}, {s64})
737         .immIdx(0); // Inform verifier imm idx 0 is handled.
738 
739     getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
740         .legalForCartesianProduct({p0}, {p0}, {s64})
741         .immIdx(0); // Inform verifier imm idx 0 is handled.
742 
743     // G_MEMCPY_INLINE does not have a tailcall immediate
744     getActionDefinitionsBuilder(G_MEMCPY_INLINE)
745         .legalForCartesianProduct({p0}, {p0}, {s64});
746 
747   } else {
748     getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
749         .libcall();
750   }
751 
752   // FIXME: Legal vector types are only legal with NEON.
753   auto &ABSActions = getActionDefinitionsBuilder(G_ABS);
754   if (HasCSSC)
755     ABSActions
756         .legalFor({s32, s64});
757   ABSActions
758       .legalFor(PackedVectorAllTypeList)
759       .lowerIf(isScalar(0));
760 
761   getActionDefinitionsBuilder(G_VECREDUCE_FADD)
762       // We only have FADDP to do reduction-like operations. Lower the rest.
763       .legalFor({{s32, v2s32}, {s64, v2s64}})
764       .clampMaxNumElements(1, s64, 2)
765       .clampMaxNumElements(1, s32, 2)
766       .lower();
767 
768   getActionDefinitionsBuilder(G_VECREDUCE_ADD)
769       .legalFor(
770           {{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
771       .clampMaxNumElements(1, s64, 2)
772       .clampMaxNumElements(1, s32, 4)
773       .lower();
774 
775   getActionDefinitionsBuilder(
776       {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
777       // Try to break down into smaller vectors as long as they're at least 64
778       // bits. This lets us use vector operations for some parts of the
779       // reduction.
780       .fewerElementsIf(
781           [=](const LegalityQuery &Q) {
782             LLT SrcTy = Q.Types[1];
783             if (SrcTy.isScalar())
784               return false;
785             if (!isPowerOf2_32(SrcTy.getNumElements()))
786               return false;
787             // We can usually perform 64b vector operations.
788             return SrcTy.getSizeInBits() > 64;
789           },
790           [=](const LegalityQuery &Q) {
791             LLT SrcTy = Q.Types[1];
792             return std::make_pair(1, SrcTy.divide(2));
793           })
794       .scalarize(1)
795       .lower();
796 
797   getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
798       .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });
799 
800   getActionDefinitionsBuilder({G_FSHL, G_FSHR}).lower();
801 
802   getActionDefinitionsBuilder(G_ROTR)
803       .legalFor({{s32, s64}, {s64, s64}})
804       .customIf([=](const LegalityQuery &Q) {
805         return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
806       })
807       .lower();
808   getActionDefinitionsBuilder(G_ROTL).lower();
809 
810   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
811       .customFor({{s32, s32}, {s64, s64}});
812 
813   auto always = [=](const LegalityQuery &Q) { return true; };
814   auto &CTPOPActions = getActionDefinitionsBuilder(G_CTPOP);
815   if (HasCSSC)
816     CTPOPActions
817         .legalFor({{s32, s32},
818                    {s64, s64},
819                    {v8s8, v8s8},
820                    {v16s8, v16s8}})
821         .customFor({{s128, s128},
822                     {v2s64, v2s64},
823                     {v2s32, v2s32},
824                     {v4s32, v4s32},
825                     {v4s16, v4s16},
826                     {v8s16, v8s16}});
827   else
828     CTPOPActions
829         .legalFor({{v8s8, v8s8},
830                    {v16s8, v16s8}})
831         .customFor({{s32, s32},
832                     {s64, s64},
833                     {s128, s128},
834                     {v2s64, v2s64},
835                     {v2s32, v2s32},
836                     {v4s32, v4s32},
837                     {v4s16, v4s16},
838                     {v8s16, v8s16}});
839   CTPOPActions
840       .clampScalar(0, s32, s128)
841       .widenScalarToNextPow2(0)
842       .minScalarEltSameAsIf(always, 1, 0)
843       .maxScalarEltSameAsIf(always, 1, 0);
844 
845   // TODO: Vector types.
846   getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0));
847 
848   // TODO: Vector types.
849   getActionDefinitionsBuilder({G_FMAXNUM, G_FMINNUM})
850       .legalFor({MinFPScalar, s32, s64})
851       .libcallFor({s128})
852       .minScalar(0, MinFPScalar);
853 
854   getActionDefinitionsBuilder({G_FMAXIMUM, G_FMINIMUM})
855       .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
856       .legalIf([=](const LegalityQuery &Query) {
857         const auto &Ty = Query.Types[0];
858         return (Ty == v8s16 || Ty == v4s16) && HasFP16;
859       })
860       .minScalar(0, MinFPScalar)
861       .clampNumElements(0, v4s16, v8s16)
862       .clampNumElements(0, v2s32, v4s32)
863       .clampNumElements(0, v2s64, v2s64);
864 
865   // TODO: Libcall support for s128.
866   // TODO: s16 should be legal with full FP16 support.
867   getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
868       .legalFor({{s64, s32}, {s64, s64}});
869 
870   // TODO: Custom legalization for vector types.
871   // TODO: Custom legalization for mismatched types.
872   // TODO: s16 support.
873   getActionDefinitionsBuilder(G_FCOPYSIGN).customFor({{s32, s32}, {s64, s64}});
874 
875   getActionDefinitionsBuilder(G_FMAD).lower();
876 
877   getLegacyLegalizerInfo().computeTables();
878   verify(*ST.getInstrInfo());
879 }
880 
881 bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
882                                           MachineInstr &MI) const {
883   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
884   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
885   GISelChangeObserver &Observer = Helper.Observer;
886   switch (MI.getOpcode()) {
887   default:
888     // No idea what to do.
889     return false;
890   case TargetOpcode::G_VAARG:
891     return legalizeVaArg(MI, MRI, MIRBuilder);
892   case TargetOpcode::G_LOAD:
893   case TargetOpcode::G_STORE:
894     return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
895   case TargetOpcode::G_SHL:
896   case TargetOpcode::G_ASHR:
897   case TargetOpcode::G_LSHR:
898     return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
899   case TargetOpcode::G_GLOBAL_VALUE:
900     return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
901   case TargetOpcode::G_TRUNC:
902     return legalizeVectorTrunc(MI, Helper);
903   case TargetOpcode::G_SBFX:
904   case TargetOpcode::G_UBFX:
905     return legalizeBitfieldExtract(MI, MRI, Helper);
906   case TargetOpcode::G_ROTR:
907     return legalizeRotate(MI, MRI, Helper);
908   case TargetOpcode::G_CTPOP:
909     return legalizeCTPOP(MI, MRI, Helper);
910   case TargetOpcode::G_ATOMIC_CMPXCHG:
911     return legalizeAtomicCmpxchg128(MI, MRI, Helper);
912   case TargetOpcode::G_CTTZ:
913     return legalizeCTTZ(MI, Helper);
914   case TargetOpcode::G_BZERO:
915   case TargetOpcode::G_MEMCPY:
916   case TargetOpcode::G_MEMMOVE:
917   case TargetOpcode::G_MEMSET:
918     return legalizeMemOps(MI, Helper);
919   case TargetOpcode::G_FCOPYSIGN:
920     return legalizeFCopySign(MI, Helper);
921   }
922 
923   llvm_unreachable("expected switch to return");
924 }
925 
926 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
927                                           MachineRegisterInfo &MRI,
928                                           LegalizerHelper &Helper) const {
929   // To allow for imported patterns to match, we ensure that the rotate amount
930   // is 64b with an extension.
931   Register AmtReg = MI.getOperand(2).getReg();
932   LLT AmtTy = MRI.getType(AmtReg);
933   (void)AmtTy;
934   assert(AmtTy.isScalar() && "Expected a scalar rotate");
935   assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
936   auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
937   Helper.Observer.changingInstr(MI);
938   MI.getOperand(2).setReg(NewAmt.getReg(0));
939   Helper.Observer.changedInstr(MI);
940   return true;
941 }
942 
943 static void extractParts(Register Reg, MachineRegisterInfo &MRI,
944                          MachineIRBuilder &MIRBuilder, LLT Ty, int NumParts,
945                          SmallVectorImpl<Register> &VRegs) {
946   for (int I = 0; I < NumParts; ++I)
947     VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
948   MIRBuilder.buildUnmerge(VRegs, Reg);
949 }
950 
951 bool AArch64LegalizerInfo::legalizeVectorTrunc(
952     MachineInstr &MI, LegalizerHelper &Helper) const {
953   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
954   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
955   // Similar to how operand splitting is done in SelectiondDAG, we can handle
956   // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
957   //   %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
958   //   %lo16(<4 x s16>) = G_TRUNC %inlo
959   //   %hi16(<4 x s16>) = G_TRUNC %inhi
960   //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
961   //   %res(<8 x s8>) = G_TRUNC %in16
962 
963   Register DstReg = MI.getOperand(0).getReg();
964   Register SrcReg = MI.getOperand(1).getReg();
965   LLT DstTy = MRI.getType(DstReg);
966   LLT SrcTy = MRI.getType(SrcReg);
967   assert(isPowerOf2_32(DstTy.getSizeInBits()) &&
968          isPowerOf2_32(SrcTy.getSizeInBits()));
969 
970   // Split input type.
971   LLT SplitSrcTy =
972       SrcTy.changeElementCount(SrcTy.getElementCount().divideCoefficientBy(2));
973   // First, split the source into two smaller vectors.
974   SmallVector<Register, 2> SplitSrcs;
975   extractParts(SrcReg, MRI, MIRBuilder, SplitSrcTy, 2, SplitSrcs);
976 
977   // Truncate the splits into intermediate narrower elements.
978   LLT InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
979   for (unsigned I = 0; I < SplitSrcs.size(); ++I)
980     SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);
981 
982   auto Concat = MIRBuilder.buildConcatVectors(
983       DstTy.changeElementSize(DstTy.getScalarSizeInBits() * 2), SplitSrcs);
984 
985   Helper.Observer.changingInstr(MI);
986   MI.getOperand(1).setReg(Concat.getReg(0));
987   Helper.Observer.changedInstr(MI);
988   return true;
989 }
990 
991 bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
992     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
993     GISelChangeObserver &Observer) const {
994   assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
995   // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
996   // G_ADD_LOW instructions.
997   // By splitting this here, we can optimize accesses in the small code model by
998   // folding in the G_ADD_LOW into the load/store offset.
999   auto &GlobalOp = MI.getOperand(1);
1000   const auto* GV = GlobalOp.getGlobal();
1001   if (GV->isThreadLocal())
1002     return true; // Don't want to modify TLS vars.
1003 
1004   auto &TM = ST->getTargetLowering()->getTargetMachine();
1005   unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);
1006 
1007   if (OpFlags & AArch64II::MO_GOT)
1008     return true;
1009 
1010   auto Offset = GlobalOp.getOffset();
1011   Register DstReg = MI.getOperand(0).getReg();
1012   auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
1013                   .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
1014   // Set the regclass on the dest reg too.
1015   MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1016 
1017   // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
1018   // by creating a MOVK that sets bits 48-63 of the register to (global address
1019   // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
1020   // prevent an incorrect tag being generated during relocation when the the
1021   // global appears before the code section. Without the offset, a global at
1022   // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
1023   // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
1024   // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
1025   // instead of `0xf`.
1026   // This assumes that we're in the small code model so we can assume a binary
1027   // size of <= 4GB, which makes the untagged PC relative offset positive. The
1028   // binary must also be loaded into address range [0, 2^48). Both of these
1029   // properties need to be ensured at runtime when using tagged addresses.
1030   if (OpFlags & AArch64II::MO_TAGGED) {
1031     assert(!Offset &&
1032            "Should not have folded in an offset for a tagged global!");
1033     ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
1034                .addGlobalAddress(GV, 0x100000000,
1035                                  AArch64II::MO_PREL | AArch64II::MO_G3)
1036                .addImm(48);
1037     MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1038   }
1039 
1040   MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
1041       .addGlobalAddress(GV, Offset,
1042                         OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
1043   MI.eraseFromParent();
1044   return true;
1045 }
1046 
1047 bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
1048                                              MachineInstr &MI) const {
1049   switch (MI.getIntrinsicID()) {
1050   case Intrinsic::vacopy: {
1051     unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
1052     unsigned VaListSize =
1053       (ST->isTargetDarwin() || ST->isTargetWindows())
1054           ? PtrSize
1055           : ST->isTargetILP32() ? 20 : 32;
1056 
1057     MachineFunction &MF = *MI.getMF();
1058     auto Val = MF.getRegInfo().createGenericVirtualRegister(
1059         LLT::scalar(VaListSize * 8));
1060     MachineIRBuilder MIB(MI);
1061     MIB.buildLoad(Val, MI.getOperand(2),
1062                   *MF.getMachineMemOperand(MachinePointerInfo(),
1063                                            MachineMemOperand::MOLoad,
1064                                            VaListSize, Align(PtrSize)));
1065     MIB.buildStore(Val, MI.getOperand(1),
1066                    *MF.getMachineMemOperand(MachinePointerInfo(),
1067                                             MachineMemOperand::MOStore,
1068                                             VaListSize, Align(PtrSize)));
1069     MI.eraseFromParent();
1070     return true;
1071   }
1072   case Intrinsic::get_dynamic_area_offset: {
1073     MachineIRBuilder &MIB = Helper.MIRBuilder;
1074     MIB.buildConstant(MI.getOperand(0).getReg(), 0);
1075     MI.eraseFromParent();
1076     return true;
1077   }
1078   case Intrinsic::aarch64_mops_memset_tag: {
1079     assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1080     // Zext the value to 64 bit
1081     MachineIRBuilder MIB(MI);
1082     auto &Value = MI.getOperand(3);
1083     Register ZExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
1084     Value.setReg(ZExtValueReg);
1085     return true;
1086   }
1087   case Intrinsic::prefetch: {
1088     MachineIRBuilder MIB(MI);
1089     auto &AddrVal = MI.getOperand(1);
1090 
1091     int64_t IsWrite = MI.getOperand(2).getImm();
1092     int64_t Locality = MI.getOperand(3).getImm();
1093     int64_t IsData = MI.getOperand(4).getImm();
1094 
1095     bool IsStream = Locality == 0;
1096     if (Locality != 0) {
1097       assert(Locality <= 3 && "Prefetch locality out-of-range");
1098       // The locality degree is the opposite of the cache speed.
1099       // Put the number the other way around.
1100       // The encoding starts at 0 for level 1
1101       Locality = 3 - Locality;
1102     }
1103 
1104     unsigned PrfOp =
1105         (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;
1106 
1107     MIB.buildInstr(AArch64::G_PREFETCH).addImm(PrfOp).add(AddrVal);
1108     MI.eraseFromParent();
1109     return true;
1110   }
1111   case Intrinsic::aarch64_prefetch: {
1112     MachineIRBuilder MIB(MI);
1113     auto &AddrVal = MI.getOperand(1);
1114 
1115     int64_t IsWrite = MI.getOperand(2).getImm();
1116     int64_t Target = MI.getOperand(3).getImm();
1117     int64_t IsStream = MI.getOperand(4).getImm();
1118     int64_t IsData = MI.getOperand(5).getImm();
1119 
1120     unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
1121                      (!IsData << 3) |    // IsDataCache bit
1122                      (Target << 1) |     // Cache level bits
1123                      (unsigned)IsStream; // Stream bit
1124 
1125     MIB.buildInstr(AArch64::G_PREFETCH).addImm(PrfOp).add(AddrVal);
1126     MI.eraseFromParent();
1127     return true;
1128   }
1129   }
1130 
1131   return true;
1132 }
1133 
1134 bool AArch64LegalizerInfo::legalizeShlAshrLshr(
1135     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1136     GISelChangeObserver &Observer) const {
1137   assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
1138          MI.getOpcode() == TargetOpcode::G_LSHR ||
1139          MI.getOpcode() == TargetOpcode::G_SHL);
1140   // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
1141   // imported patterns can select it later. Either way, it will be legal.
1142   Register AmtReg = MI.getOperand(2).getReg();
1143   auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
1144   if (!VRegAndVal)
1145     return true;
1146   // Check the shift amount is in range for an immediate form.
1147   int64_t Amount = VRegAndVal->Value.getSExtValue();
1148   if (Amount > 31)
1149     return true; // This will have to remain a register variant.
1150   auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
1151   Observer.changingInstr(MI);
1152   MI.getOperand(2).setReg(ExtCst.getReg(0));
1153   Observer.changedInstr(MI);
1154   return true;
1155 }
1156 
1157 static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
1158                                 MachineRegisterInfo &MRI) {
1159   Base = Root;
1160   Offset = 0;
1161 
1162   Register NewBase;
1163   int64_t NewOffset;
1164   if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
1165       isShiftedInt<7, 3>(NewOffset)) {
1166     Base = NewBase;
1167     Offset = NewOffset;
1168   }
1169 }
1170 
1171 // FIXME: This should be removed and replaced with the generic bitcast legalize
1172 // action.
1173 bool AArch64LegalizerInfo::legalizeLoadStore(
1174     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1175     GISelChangeObserver &Observer) const {
1176   assert(MI.getOpcode() == TargetOpcode::G_STORE ||
1177          MI.getOpcode() == TargetOpcode::G_LOAD);
1178   // Here we just try to handle vector loads/stores where our value type might
1179   // have pointer elements, which the SelectionDAG importer can't handle. To
1180   // allow the existing patterns for s64 to fire for p0, we just try to bitcast
1181   // the value to use s64 types.
1182 
1183   // Custom legalization requires the instruction, if not deleted, must be fully
1184   // legalized. In order to allow further legalization of the inst, we create
1185   // a new instruction and erase the existing one.
1186 
1187   Register ValReg = MI.getOperand(0).getReg();
1188   const LLT ValTy = MRI.getType(ValReg);
1189 
1190   if (ValTy == LLT::scalar(128)) {
1191     assert((*MI.memoperands_begin())->getSuccessOrdering() ==
1192                AtomicOrdering::Monotonic ||
1193            (*MI.memoperands_begin())->getSuccessOrdering() ==
1194                AtomicOrdering::Unordered);
1195     assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
1196     LLT s64 = LLT::scalar(64);
1197     MachineInstrBuilder NewI;
1198     if (MI.getOpcode() == TargetOpcode::G_LOAD) {
1199       NewI = MIRBuilder.buildInstr(AArch64::LDPXi, {s64, s64}, {});
1200       MIRBuilder.buildMergeLikeInstr(
1201           ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
1202     } else {
1203       auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
1204       NewI = MIRBuilder.buildInstr(
1205           AArch64::STPXi, {}, {Split->getOperand(0), Split->getOperand(1)});
1206     }
1207     Register Base;
1208     int Offset;
1209     matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
1210     NewI.addUse(Base);
1211     NewI.addImm(Offset / 8);
1212 
1213     NewI.cloneMemRefs(MI);
1214     constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
1215                                      *MRI.getTargetRegisterInfo(),
1216                                      *ST->getRegBankInfo());
1217     MI.eraseFromParent();
1218     return true;
1219   }
1220 
1221   if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
1222       ValTy.getElementType().getAddressSpace() != 0) {
1223     LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
1224     return false;
1225   }
1226 
1227   unsigned PtrSize = ValTy.getElementType().getSizeInBits();
1228   const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
1229   auto &MMO = **MI.memoperands_begin();
1230   MMO.setType(NewTy);
1231 
1232   if (MI.getOpcode() == TargetOpcode::G_STORE) {
1233     auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
1234     MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
1235   } else {
1236     auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
1237     MIRBuilder.buildBitcast(ValReg, NewLoad);
1238   }
1239   MI.eraseFromParent();
1240   return true;
1241 }
1242 
1243 bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
1244                                          MachineRegisterInfo &MRI,
1245                                          MachineIRBuilder &MIRBuilder) const {
1246   MachineFunction &MF = MIRBuilder.getMF();
1247   Align Alignment(MI.getOperand(2).getImm());
1248   Register Dst = MI.getOperand(0).getReg();
1249   Register ListPtr = MI.getOperand(1).getReg();
1250 
1251   LLT PtrTy = MRI.getType(ListPtr);
1252   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1253 
1254   const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
1255   const Align PtrAlign = Align(PtrSize);
1256   auto List = MIRBuilder.buildLoad(
1257       PtrTy, ListPtr,
1258       *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1259                                PtrTy, PtrAlign));
1260 
1261   MachineInstrBuilder DstPtr;
1262   if (Alignment > PtrAlign) {
1263     // Realign the list to the actual required alignment.
1264     auto AlignMinus1 =
1265         MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
1266     auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
1267     DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
1268   } else
1269     DstPtr = List;
1270 
1271   LLT ValTy = MRI.getType(Dst);
1272   uint64_t ValSize = ValTy.getSizeInBits() / 8;
1273   MIRBuilder.buildLoad(
1274       Dst, DstPtr,
1275       *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1276                                ValTy, std::max(Alignment, PtrAlign)));
1277 
1278   auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));
1279 
1280   auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));
1281 
1282   MIRBuilder.buildStore(NewList, ListPtr,
1283                         *MF.getMachineMemOperand(MachinePointerInfo(),
1284                                                  MachineMemOperand::MOStore,
1285                                                  PtrTy, PtrAlign));
1286 
1287   MI.eraseFromParent();
1288   return true;
1289 }
1290 
1291 bool AArch64LegalizerInfo::legalizeBitfieldExtract(
1292     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
1293   // Only legal if we can select immediate forms.
1294   // TODO: Lower this otherwise.
1295   return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
1296          getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
1297 }
1298 
1299 bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
1300                                          MachineRegisterInfo &MRI,
1301                                          LegalizerHelper &Helper) const {
1302   // When there is no integer popcount instruction (FEAT_CSSC isn't available),
1303   // it can be more efficiently lowered to the following sequence that uses
1304   // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
1305   // registers are cheap.
1306   //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
1307   //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
1308   //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
1309   //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
1310   //
1311   // For 128 bit vector popcounts, we lower to the following sequence:
1312   //  cnt.16b   v0, v0  // v8s16, v4s32, v2s64
1313   //  uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
1314   //  uaddlp.4s v0, v0  //        v4s32, v2s64
1315   //  uaddlp.2d v0, v0  //               v2s64
1316   //
1317   // For 64 bit vector popcounts, we lower to the following sequence:
1318   //  cnt.8b    v0, v0  // v4s16, v2s32
1319   //  uaddlp.4h v0, v0  // v4s16, v2s32
1320   //  uaddlp.2s v0, v0  //        v2s32
1321 
1322   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1323   Register Dst = MI.getOperand(0).getReg();
1324   Register Val = MI.getOperand(1).getReg();
1325   LLT Ty = MRI.getType(Val);
1326   unsigned Size = Ty.getSizeInBits();
1327 
1328   assert(Ty == MRI.getType(Dst) &&
1329          "Expected src and dst to have the same type!");
1330 
1331   if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
1332     LLT s64 = LLT::scalar(64);
1333 
1334     auto Split = MIRBuilder.buildUnmerge(s64, Val);
1335     auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0));
1336     auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1));
1337     auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2);
1338 
1339     MIRBuilder.buildZExt(Dst, Add);
1340     MI.eraseFromParent();
1341     return true;
1342   }
1343 
1344   if (!ST->hasNEON() ||
1345       MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) {
1346     // Use generic lowering when custom lowering is not possible.
1347     return Ty.isScalar() && (Size == 32 || Size == 64) &&
1348            Helper.lowerBitCount(MI) ==
1349                LegalizerHelper::LegalizeResult::Legalized;
1350   }
1351 
1352   // Pre-conditioning: widen Val up to the nearest vector type.
1353   // s32,s64,v4s16,v2s32 -> v8i8
1354   // v8s16,v4s32,v2s64 -> v16i8
1355   LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
1356   if (Ty.isScalar()) {
1357     assert((Size == 32 || Size == 64 || Size == 128) && "Expected only 32, 64, or 128 bit scalars!");
1358     if (Size == 32) {
1359       Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
1360     }
1361   }
1362   Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);
1363 
1364   // Count bits in each byte-sized lane.
1365   auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);
1366 
1367   // Sum across lanes.
1368   Register HSum = CTPOP.getReg(0);
1369   unsigned Opc;
1370   SmallVector<LLT> HAddTys;
1371   if (Ty.isScalar()) {
1372     Opc = Intrinsic::aarch64_neon_uaddlv;
1373     HAddTys.push_back(LLT::scalar(32));
1374   } else if (Ty == LLT::fixed_vector(8, 16)) {
1375     Opc = Intrinsic::aarch64_neon_uaddlp;
1376     HAddTys.push_back(LLT::fixed_vector(8, 16));
1377   } else if (Ty == LLT::fixed_vector(4, 32)) {
1378     Opc = Intrinsic::aarch64_neon_uaddlp;
1379     HAddTys.push_back(LLT::fixed_vector(8, 16));
1380     HAddTys.push_back(LLT::fixed_vector(4, 32));
1381   } else if (Ty == LLT::fixed_vector(2, 64)) {
1382     Opc = Intrinsic::aarch64_neon_uaddlp;
1383     HAddTys.push_back(LLT::fixed_vector(8, 16));
1384     HAddTys.push_back(LLT::fixed_vector(4, 32));
1385     HAddTys.push_back(LLT::fixed_vector(2, 64));
1386   } else if (Ty == LLT::fixed_vector(4, 16)) {
1387     Opc = Intrinsic::aarch64_neon_uaddlp;
1388     HAddTys.push_back(LLT::fixed_vector(4, 16));
1389   } else if (Ty == LLT::fixed_vector(2, 32)) {
1390     Opc = Intrinsic::aarch64_neon_uaddlp;
1391     HAddTys.push_back(LLT::fixed_vector(4, 16));
1392     HAddTys.push_back(LLT::fixed_vector(2, 32));
1393   } else
1394     llvm_unreachable("unexpected vector shape");
1395   MachineInstrBuilder UADD;
1396   for (LLT HTy : HAddTys) {
1397     UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}, /*HasSideEffects =*/false)
1398                      .addUse(HSum);
1399     HSum = UADD.getReg(0);
1400   }
1401 
1402   // Post-conditioning.
1403   if (Ty.isScalar() && (Size == 64 || Size == 128))
1404     MIRBuilder.buildZExt(Dst, UADD);
1405   else
1406     UADD->getOperand(0).setReg(Dst);
1407   MI.eraseFromParent();
1408   return true;
1409 }
1410 
1411 bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
1412     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
1413   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1414   LLT s64 = LLT::scalar(64);
1415   auto Addr = MI.getOperand(1).getReg();
1416   auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
1417   auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
1418   auto DstLo = MRI.createGenericVirtualRegister(s64);
1419   auto DstHi = MRI.createGenericVirtualRegister(s64);
1420 
1421   MachineInstrBuilder CAS;
1422   if (ST->hasLSE()) {
1423     // We have 128-bit CASP instructions taking XSeqPair registers, which are
1424     // s128. We need the merge/unmerge to bracket the expansion and pair up with
1425     // the rest of the MIR so we must reassemble the extracted registers into a
1426     // 128-bit known-regclass one with code like this:
1427     //
1428     //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
1429     //     %out = CASP %in1, ...
1430     //     %OldLo = G_EXTRACT %out, 0
1431     //     %OldHi = G_EXTRACT %out, 64
1432     auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
1433     unsigned Opcode;
1434     switch (Ordering) {
1435     case AtomicOrdering::Acquire:
1436       Opcode = AArch64::CASPAX;
1437       break;
1438     case AtomicOrdering::Release:
1439       Opcode = AArch64::CASPLX;
1440       break;
1441     case AtomicOrdering::AcquireRelease:
1442     case AtomicOrdering::SequentiallyConsistent:
1443       Opcode = AArch64::CASPALX;
1444       break;
1445     default:
1446       Opcode = AArch64::CASPX;
1447       break;
1448     }
1449 
1450     LLT s128 = LLT::scalar(128);
1451     auto CASDst = MRI.createGenericVirtualRegister(s128);
1452     auto CASDesired = MRI.createGenericVirtualRegister(s128);
1453     auto CASNew = MRI.createGenericVirtualRegister(s128);
1454     MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
1455         .addUse(DesiredI->getOperand(0).getReg())
1456         .addImm(AArch64::sube64)
1457         .addUse(DesiredI->getOperand(1).getReg())
1458         .addImm(AArch64::subo64);
1459     MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
1460         .addUse(NewI->getOperand(0).getReg())
1461         .addImm(AArch64::sube64)
1462         .addUse(NewI->getOperand(1).getReg())
1463         .addImm(AArch64::subo64);
1464 
1465     CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});
1466 
1467     MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
1468     MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
1469   } else {
1470     // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
1471     // can take arbitrary registers so it just has the normal GPR64 operands the
1472     // rest of AArch64 is expecting.
1473     auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
1474     unsigned Opcode;
1475     switch (Ordering) {
1476     case AtomicOrdering::Acquire:
1477       Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
1478       break;
1479     case AtomicOrdering::Release:
1480       Opcode = AArch64::CMP_SWAP_128_RELEASE;
1481       break;
1482     case AtomicOrdering::AcquireRelease:
1483     case AtomicOrdering::SequentiallyConsistent:
1484       Opcode = AArch64::CMP_SWAP_128;
1485       break;
1486     default:
1487       Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
1488       break;
1489     }
1490 
1491     auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1492     CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
1493                                 {Addr, DesiredI->getOperand(0),
1494                                  DesiredI->getOperand(1), NewI->getOperand(0),
1495                                  NewI->getOperand(1)});
1496   }
1497 
1498   CAS.cloneMemRefs(MI);
1499   constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
1500                                    *MRI.getTargetRegisterInfo(),
1501                                    *ST->getRegBankInfo());
1502 
1503   MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi});
1504   MI.eraseFromParent();
1505   return true;
1506 }
1507 
1508 bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
1509                                         LegalizerHelper &Helper) const {
1510   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1511   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1512   LLT Ty = MRI.getType(MI.getOperand(1).getReg());
1513   auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
1514   MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
1515   MI.eraseFromParent();
1516   return true;
1517 }
1518 
1519 bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
1520                                           LegalizerHelper &Helper) const {
1521   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1522 
1523   // Tagged version MOPSMemorySetTagged is legalised in legalizeIntrinsic
1524   if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
1525     // Zext the value operand to 64 bit
1526     auto &Value = MI.getOperand(1);
1527     Register ZExtValueReg =
1528         MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
1529     Value.setReg(ZExtValueReg);
1530     return true;
1531   }
1532 
1533   return false;
1534 }
1535 
1536 bool AArch64LegalizerInfo::legalizeFCopySign(MachineInstr &MI,
1537                                              LegalizerHelper &Helper) const {
1538   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1539   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1540   Register Dst = MI.getOperand(0).getReg();
1541   LLT DstTy = MRI.getType(Dst);
1542   assert(DstTy.isScalar() && "Only expected scalars right now!");
1543   const unsigned DstSize = DstTy.getSizeInBits();
1544   assert((DstSize == 32 || DstSize == 64) && "Unexpected dst type!");
1545   assert(MRI.getType(MI.getOperand(2).getReg()) == DstTy &&
1546          "Expected homogeneous types!");
1547 
1548   // We want to materialize a mask with the high bit set.
1549   uint64_t EltMask;
1550   LLT VecTy;
1551 
1552   // TODO: s16 support.
1553   switch (DstSize) {
1554   default:
1555     llvm_unreachable("Unexpected type for G_FCOPYSIGN!");
1556   case 64: {
1557     // AdvSIMD immediate moves cannot materialize out mask in a single
1558     // instruction for 64-bit elements. Instead, materialize zero and then
1559     // negate it.
1560     EltMask = 0;
1561     VecTy = LLT::fixed_vector(2, DstTy);
1562     break;
1563   }
1564   case 32:
1565     EltMask = 0x80000000ULL;
1566     VecTy = LLT::fixed_vector(4, DstTy);
1567     break;
1568   }
1569 
1570   // Widen In1 and In2 to 128 bits. We want these to eventually become
1571   // INSERT_SUBREGs.
1572   auto Undef = MIRBuilder.buildUndef(VecTy);
1573   auto Zero = MIRBuilder.buildConstant(DstTy, 0);
1574   auto Ins1 = MIRBuilder.buildInsertVectorElement(
1575       VecTy, Undef, MI.getOperand(1).getReg(), Zero);
1576   auto Ins2 = MIRBuilder.buildInsertVectorElement(
1577       VecTy, Undef, MI.getOperand(2).getReg(), Zero);
1578 
1579   // Construct the mask.
1580   auto Mask = MIRBuilder.buildConstant(VecTy, EltMask);
1581   if (DstSize == 64)
1582     Mask = MIRBuilder.buildFNeg(VecTy, Mask);
1583 
1584   auto Sel = MIRBuilder.buildInstr(AArch64::G_BIT, {VecTy}, {Ins1, Ins2, Mask});
1585 
1586   // Build an unmerge whose 0th elt is the original G_FCOPYSIGN destination. We
1587   // want this to eventually become an EXTRACT_SUBREG.
1588   SmallVector<Register, 2> DstRegs(1, Dst);
1589   for (unsigned I = 1, E = VecTy.getNumElements(); I < E; ++I)
1590     DstRegs.push_back(MRI.createGenericVirtualRegister(DstTy));
1591   MIRBuilder.buildUnmerge(DstRegs, Sel);
1592   MI.eraseFromParent();
1593   return true;
1594 }
1595