xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the LegalizerInfo class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64LegalizerInfo.h"
15 #include "AArch64Subtarget.h"
16 #include "llvm/ADT/STLExtras.h"
17 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
19 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
20 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
21 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
22 #include "llvm/CodeGen/GlobalISel/Utils.h"
23 #include "llvm/CodeGen/MachineInstr.h"
24 #include "llvm/CodeGen/MachineRegisterInfo.h"
25 #include "llvm/CodeGen/TargetOpcodes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/Intrinsics.h"
28 #include "llvm/IR/IntrinsicsAArch64.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/MathExtras.h"
31 #include <initializer_list>
32 
33 #define DEBUG_TYPE "aarch64-legalinfo"
34 
35 using namespace llvm;
36 using namespace LegalizeActions;
37 using namespace LegalizeMutations;
38 using namespace LegalityPredicates;
39 using namespace MIPatternMatch;
40 
AArch64LegalizerInfo(const AArch64Subtarget & ST)41 AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
42     : ST(&ST) {
43   using namespace TargetOpcode;
44   const LLT p0 = LLT::pointer(0, 64);
45   const LLT s8 = LLT::scalar(8);
46   const LLT s16 = LLT::scalar(16);
47   const LLT s32 = LLT::scalar(32);
48   const LLT s64 = LLT::scalar(64);
49   const LLT s128 = LLT::scalar(128);
50   const LLT v16s8 = LLT::fixed_vector(16, 8);
51   const LLT v8s8 = LLT::fixed_vector(8, 8);
52   const LLT v4s8 = LLT::fixed_vector(4, 8);
53   const LLT v2s8 = LLT::fixed_vector(2, 8);
54   const LLT v8s16 = LLT::fixed_vector(8, 16);
55   const LLT v4s16 = LLT::fixed_vector(4, 16);
56   const LLT v2s16 = LLT::fixed_vector(2, 16);
57   const LLT v2s32 = LLT::fixed_vector(2, 32);
58   const LLT v4s32 = LLT::fixed_vector(4, 32);
59   const LLT v2s64 = LLT::fixed_vector(2, 64);
60   const LLT v2p0 = LLT::fixed_vector(2, p0);
61 
62   const LLT nxv16s8 = LLT::scalable_vector(16, s8);
63   const LLT nxv8s16 = LLT::scalable_vector(8, s16);
64   const LLT nxv4s32 = LLT::scalable_vector(4, s32);
65   const LLT nxv2s64 = LLT::scalable_vector(2, s64);
66 
67   std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
68                                                         v16s8, v8s16, v4s32,
69                                                         v2s64, v2p0,
70                                                         /* End 128bit types */
71                                                         /* Begin 64bit types */
72                                                         v8s8, v4s16, v2s32};
73   std::initializer_list<LLT> ScalarAndPtrTypesList = {s8, s16, s32, s64, p0};
74   SmallVector<LLT, 8> PackedVectorAllTypesVec(PackedVectorAllTypeList);
75   SmallVector<LLT, 8> ScalarAndPtrTypesVec(ScalarAndPtrTypesList);
76 
77   const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
78 
79   // FIXME: support subtargets which have neon/fp-armv8 disabled.
80   if (!ST.hasNEON() || !ST.hasFPARMv8()) {
81     getLegacyLegalizerInfo().computeTables();
82     return;
83   }
84 
85   // Some instructions only support s16 if the subtarget has full 16-bit FP
86   // support.
87   const bool HasFP16 = ST.hasFullFP16();
88   const LLT &MinFPScalar = HasFP16 ? s16 : s32;
89 
90   const bool HasCSSC = ST.hasCSSC();
91   const bool HasRCPC3 = ST.hasRCPC3();
92   const bool HasSVE = ST.hasSVE();
93 
94   getActionDefinitionsBuilder(
95       {G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER})
96       .legalFor({p0, s8, s16, s32, s64})
97       .legalFor({v2s8, v4s8, v8s8, v16s8, v2s16, v4s16, v8s16, v2s32, v4s32,
98                  v2s64, v2p0})
99       .widenScalarToNextPow2(0)
100       .clampScalar(0, s8, s64)
101       .moreElementsToNextPow2(0)
102       .widenVectorEltsToVectorMinSize(0, 64)
103       .clampNumElements(0, v8s8, v16s8)
104       .clampNumElements(0, v4s16, v8s16)
105       .clampNumElements(0, v2s32, v4s32)
106       .clampMaxNumElements(0, s64, 2)
107       .clampMaxNumElements(0, p0, 2)
108       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
109 
110   getActionDefinitionsBuilder(G_PHI)
111       .legalFor({p0, s16, s32, s64})
112       .legalFor(PackedVectorAllTypeList)
113       .widenScalarToNextPow2(0)
114       .moreElementsToNextPow2(0)
115       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
116       .clampScalar(0, s16, s64)
117       .clampNumElements(0, v8s8, v16s8)
118       .clampNumElements(0, v4s16, v8s16)
119       .clampNumElements(0, v2s32, v4s32)
120       .clampMaxNumElements(0, s64, 2)
121       .clampMaxNumElements(0, p0, 2);
122 
123   getActionDefinitionsBuilder(G_INSERT)
124       .legalIf(all(typeInSet(0, {s32, s64, p0}), typeInSet(1, {s8, s16, s32}),
125                    smallerThan(1, 0)))
126       .widenScalarToNextPow2(0)
127       .clampScalar(0, s32, s64)
128       .widenScalarToNextPow2(1)
129       .minScalar(1, s8)
130       .maxScalarIf(typeInSet(0, {s32}), 1, s16)
131       .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);
132 
133   getActionDefinitionsBuilder(G_EXTRACT)
134       .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
135                    typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
136       .widenScalarToNextPow2(1)
137       .clampScalar(1, s32, s128)
138       .widenScalarToNextPow2(0)
139       .minScalar(0, s16)
140       .maxScalarIf(typeInSet(1, {s32}), 0, s16)
141       .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
142       .maxScalarIf(typeInSet(1, {s128}), 0, s64);
143 
144   getActionDefinitionsBuilder({G_ADD, G_SUB, G_AND, G_OR, G_XOR})
145       .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64})
146       .legalFor(HasSVE, {nxv16s8, nxv8s16, nxv4s32, nxv2s64})
147       .widenScalarToNextPow2(0)
148       .clampScalar(0, s32, s64)
149       .clampMaxNumElements(0, s8, 16)
150       .clampMaxNumElements(0, s16, 8)
151       .clampNumElements(0, v2s32, v4s32)
152       .clampNumElements(0, v2s64, v2s64)
153       .minScalarOrEltIf(
154           [=](const LegalityQuery &Query) {
155             return Query.Types[0].getNumElements() <= 2;
156           },
157           0, s32)
158       .minScalarOrEltIf(
159           [=](const LegalityQuery &Query) {
160             return Query.Types[0].getNumElements() <= 4;
161           },
162           0, s16)
163       .minScalarOrEltIf(
164           [=](const LegalityQuery &Query) {
165             return Query.Types[0].getNumElements() <= 16;
166           },
167           0, s8)
168       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
169       .moreElementsToNextPow2(0);
170 
171   getActionDefinitionsBuilder(G_MUL)
172       .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64})
173       .widenScalarToNextPow2(0)
174       .clampScalar(0, s32, s64)
175       .clampMaxNumElements(0, s8, 16)
176       .clampMaxNumElements(0, s16, 8)
177       .clampNumElements(0, v2s32, v4s32)
178       .clampNumElements(0, v2s64, v2s64)
179       .minScalarOrEltIf(
180           [=](const LegalityQuery &Query) {
181             return Query.Types[0].getNumElements() <= 2;
182           },
183           0, s32)
184       .minScalarOrEltIf(
185           [=](const LegalityQuery &Query) {
186             return Query.Types[0].getNumElements() <= 4;
187           },
188           0, s16)
189       .minScalarOrEltIf(
190           [=](const LegalityQuery &Query) {
191             return Query.Types[0].getNumElements() <= 16;
192           },
193           0, s8)
194       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
195       .moreElementsToNextPow2(0);
196 
197   getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
198       .customIf([=](const LegalityQuery &Query) {
199         const auto &SrcTy = Query.Types[0];
200         const auto &AmtTy = Query.Types[1];
201         return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
202                AmtTy.getSizeInBits() == 32;
203       })
204       .legalFor({
205           {s32, s32},
206           {s32, s64},
207           {s64, s64},
208           {v8s8, v8s8},
209           {v16s8, v16s8},
210           {v4s16, v4s16},
211           {v8s16, v8s16},
212           {v2s32, v2s32},
213           {v4s32, v4s32},
214           {v2s64, v2s64},
215       })
216       .widenScalarToNextPow2(0)
217       .clampScalar(1, s32, s64)
218       .clampScalar(0, s32, s64)
219       .clampNumElements(0, v8s8, v16s8)
220       .clampNumElements(0, v4s16, v8s16)
221       .clampNumElements(0, v2s32, v4s32)
222       .clampNumElements(0, v2s64, v2s64)
223       .moreElementsToNextPow2(0)
224       .minScalarSameAs(1, 0)
225       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
226 
227   getActionDefinitionsBuilder(G_PTR_ADD)
228       .legalFor({{p0, s64}, {v2p0, v2s64}})
229       .clampScalarOrElt(1, s64, s64)
230       .clampNumElements(0, v2p0, v2p0);
231 
232   getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
233 
234   getActionDefinitionsBuilder({G_SDIV, G_UDIV})
235       .legalFor({s32, s64})
236       .libcallFor({s128})
237       .clampScalar(0, s32, s64)
238       .widenScalarToNextPow2(0)
239       .scalarize(0);
240 
241   getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
242       .lowerFor({s8, s16, s32, s64, v2s32, v4s32, v2s64})
243       .libcallFor({s128})
244       .widenScalarOrEltToNextPow2(0)
245       .minScalarOrElt(0, s32)
246       .clampNumElements(0, v2s32, v4s32)
247       .clampNumElements(0, v2s64, v2s64)
248       .scalarize(0);
249 
250   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
251       .widenScalarToNextPow2(0, /*Min = */ 32)
252       .clampScalar(0, s32, s64)
253       .lower();
254 
255   getActionDefinitionsBuilder({G_SMULH, G_UMULH})
256       .legalFor({s64, v16s8, v8s16, v4s32})
257       .lower();
258 
259   getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
260       .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
261       .legalFor(HasCSSC, {s32, s64})
262       .minScalar(HasCSSC, 0, s32)
263       .clampNumElements(0, v8s8, v16s8)
264       .clampNumElements(0, v4s16, v8s16)
265       .clampNumElements(0, v2s32, v4s32)
266       .lower();
267 
268   // FIXME: Legal vector types are only legal with NEON.
269   getActionDefinitionsBuilder(G_ABS)
270       .legalFor(HasCSSC, {s32, s64})
271       .legalFor(PackedVectorAllTypeList)
272       .customIf([=](const LegalityQuery &Q) {
273         // TODO: Fix suboptimal codegen for 128+ bit types.
274         LLT SrcTy = Q.Types[0];
275         return SrcTy.isScalar() && SrcTy.getSizeInBits() < 128;
276       })
277       .widenScalarIf(
278           [=](const LegalityQuery &Query) { return Query.Types[0] == v4s8; },
279           [=](const LegalityQuery &Query) { return std::make_pair(0, v4s16); })
280       .widenScalarIf(
281           [=](const LegalityQuery &Query) { return Query.Types[0] == v2s16; },
282           [=](const LegalityQuery &Query) { return std::make_pair(0, v2s32); })
283       .clampNumElements(0, v8s8, v16s8)
284       .clampNumElements(0, v4s16, v8s16)
285       .clampNumElements(0, v2s32, v4s32)
286       .clampNumElements(0, v2s64, v2s64)
287       .moreElementsToNextPow2(0)
288       .lower();
289 
290   getActionDefinitionsBuilder(
291       {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
292       .legalFor({{s32, s32}, {s64, s32}})
293       .clampScalar(0, s32, s64)
294       .clampScalar(1, s32, s64)
295       .widenScalarToNextPow2(0);
296 
297   getActionDefinitionsBuilder({G_FSHL, G_FSHR})
298       .customFor({{s32, s32}, {s32, s64}, {s64, s64}})
299       .lower();
300 
301   getActionDefinitionsBuilder(G_ROTR)
302       .legalFor({{s32, s64}, {s64, s64}})
303       .customIf([=](const LegalityQuery &Q) {
304         return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
305       })
306       .lower();
307   getActionDefinitionsBuilder(G_ROTL).lower();
308 
309   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
310       .customFor({{s32, s32}, {s64, s64}});
311 
312   auto always = [=](const LegalityQuery &Q) { return true; };
313   getActionDefinitionsBuilder(G_CTPOP)
314       .legalFor(HasCSSC, {{s32, s32}, {s64, s64}})
315       .legalFor({{v8s8, v8s8}, {v16s8, v16s8}})
316       .customFor(!HasCSSC, {{s32, s32}, {s64, s64}})
317       .customFor({{s128, s128},
318                   {v4s16, v4s16},
319                   {v8s16, v8s16},
320                   {v2s32, v2s32},
321                   {v4s32, v4s32},
322                   {v2s64, v2s64}})
323       .clampScalar(0, s32, s128)
324       .widenScalarToNextPow2(0)
325       .minScalarEltSameAsIf(always, 1, 0)
326       .maxScalarEltSameAsIf(always, 1, 0)
327       .clampNumElements(0, v8s8, v16s8)
328       .clampNumElements(0, v4s16, v8s16)
329       .clampNumElements(0, v2s32, v4s32)
330       .clampNumElements(0, v2s64, v2s64)
331       .moreElementsToNextPow2(0)
332       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
333 
334   getActionDefinitionsBuilder(G_CTLZ)
335       .legalFor({{s32, s32},
336                  {s64, s64},
337                  {v8s8, v8s8},
338                  {v16s8, v16s8},
339                  {v4s16, v4s16},
340                  {v8s16, v8s16},
341                  {v2s32, v2s32},
342                  {v4s32, v4s32}})
343       .widenScalarToNextPow2(1, /*Min=*/32)
344       .clampScalar(1, s32, s64)
345       .clampNumElements(0, v8s8, v16s8)
346       .clampNumElements(0, v4s16, v8s16)
347       .clampNumElements(0, v2s32, v4s32)
348       .moreElementsToNextPow2(0)
349       .scalarizeIf(scalarOrEltWiderThan(0, 32), 0)
350       .scalarSameSizeAs(0, 1);
351 
352   getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();
353 
354   getActionDefinitionsBuilder(G_CTTZ)
355       .lowerIf(isVector(0))
356       .widenScalarToNextPow2(1, /*Min=*/32)
357       .clampScalar(1, s32, s64)
358       .scalarSameSizeAs(0, 1)
359       .legalFor(HasCSSC, {s32, s64})
360       .customFor(!HasCSSC, {s32, s64});
361 
362   getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();
363 
364   getActionDefinitionsBuilder(G_BITREVERSE)
365       .legalFor({s32, s64, v8s8, v16s8})
366       .widenScalarToNextPow2(0, /*Min = */ 32)
367       .widenScalarOrEltToNextPow2OrMinSize(0, 8)
368       .clampScalar(0, s32, s64)
369       .clampNumElements(0, v8s8, v16s8)
370       .clampNumElements(0, v4s16, v8s16)
371       .clampNumElements(0, v2s32, v4s32)
372       .clampNumElements(0, v2s64, v2s64)
373       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
374       .moreElementsToNextPow2(0)
375       .lower();
376 
377   getActionDefinitionsBuilder(G_BSWAP)
378       .legalFor({s32, s64, v4s16, v8s16, v2s32, v4s32, v2s64})
379       .widenScalarOrEltToNextPow2(0, 16)
380       .clampScalar(0, s32, s64)
381       .clampNumElements(0, v4s16, v8s16)
382       .clampNumElements(0, v2s32, v4s32)
383       .clampNumElements(0, v2s64, v2s64)
384       .moreElementsToNextPow2(0);
385 
386   getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
387       .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64})
388       .legalFor(HasSVE, {nxv16s8, nxv8s16, nxv4s32, nxv2s64})
389       .clampNumElements(0, v8s8, v16s8)
390       .clampNumElements(0, v4s16, v8s16)
391       .clampNumElements(0, v2s32, v4s32)
392       .clampMaxNumElements(0, s64, 2)
393       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
394       .moreElementsToNextPow2(0)
395       .lower();
396 
397   getActionDefinitionsBuilder(
398       {G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FSQRT, G_FMAXNUM, G_FMINNUM,
399        G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR, G_FRINT, G_FNEARBYINT,
400        G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
401       .legalFor({s32, s64, v2s32, v4s32, v2s64})
402       .legalFor(HasFP16, {s16, v4s16, v8s16})
403       .libcallFor({s128})
404       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
405       .minScalarOrElt(0, MinFPScalar)
406       .clampNumElements(0, v4s16, v8s16)
407       .clampNumElements(0, v2s32, v4s32)
408       .clampNumElements(0, v2s64, v2s64)
409       .moreElementsToNextPow2(0);
410 
411   getActionDefinitionsBuilder({G_FABS, G_FNEG})
412       .legalFor({s32, s64, v2s32, v4s32, v2s64})
413       .legalFor(HasFP16, {s16, v4s16, v8s16})
414       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
415       .lowerIf(scalarOrEltWiderThan(0, 64))
416       .clampNumElements(0, v4s16, v8s16)
417       .clampNumElements(0, v2s32, v4s32)
418       .clampNumElements(0, v2s64, v2s64)
419       .moreElementsToNextPow2(0)
420       .lowerFor({s16, v4s16, v8s16});
421 
422   getActionDefinitionsBuilder(G_FREM)
423       .libcallFor({s32, s64, s128})
424       .minScalar(0, s32)
425       .scalarize(0);
426 
427   getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
428       .legalFor({{s64, MinFPScalar}, {s64, s32}, {s64, s64}})
429       .libcallFor({{s64, s128}})
430       .minScalarOrElt(1, MinFPScalar);
431 
432   getActionDefinitionsBuilder({G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2,
433                                G_FLOG10, G_FTAN, G_FEXP, G_FEXP2, G_FEXP10,
434                                G_FACOS, G_FASIN, G_FATAN, G_FATAN2, G_FCOSH,
435                                G_FSINH, G_FTANH})
436       // We need a call for these, so we always need to scalarize.
437       .scalarize(0)
438       // Regardless of FP16 support, widen 16-bit elements to 32-bits.
439       .minScalar(0, s32)
440       .libcallFor({s32, s64, s128});
441   getActionDefinitionsBuilder(G_FPOWI)
442       .scalarize(0)
443       .minScalar(0, s32)
444       .libcallFor({{s32, s32}, {s64, s32}, {s128, s32}});
445 
446   // TODO: Libcall support for s128.
447   // TODO: s16 should be legal with full FP16 support.
448   getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
449       .legalFor({{s64, s32}, {s64, s64}});
450 
451   // TODO: Custom legalization for mismatched types.
452   getActionDefinitionsBuilder(G_FCOPYSIGN)
453       .moreElementsIf(
454           [](const LegalityQuery &Query) { return Query.Types[0].isScalar(); },
455           [=](const LegalityQuery &Query) {
456             const LLT Ty = Query.Types[0];
457             return std::pair(0, LLT::fixed_vector(Ty == s16 ? 4 : 2, Ty));
458           })
459       .lower();
460 
461   getActionDefinitionsBuilder(G_FMAD).lower();
462 
463   for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
464     auto &Actions =  getActionDefinitionsBuilder(Op);
465 
466     if (Op == G_SEXTLOAD)
467       Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));
468 
469     // Atomics have zero extending behavior.
470     Actions
471       .legalForTypesWithMemDesc({{s32, p0, s8, 8},
472                                  {s32, p0, s16, 8},
473                                  {s32, p0, s32, 8},
474                                  {s64, p0, s8, 2},
475                                  {s64, p0, s16, 2},
476                                  {s64, p0, s32, 4},
477                                  {s64, p0, s64, 8},
478                                  {p0, p0, s64, 8},
479                                  {v2s32, p0, s64, 8}})
480       .widenScalarToNextPow2(0)
481       .clampScalar(0, s32, s64)
482       // TODO: We could support sum-of-pow2's but the lowering code doesn't know
483       //       how to do that yet.
484       .unsupportedIfMemSizeNotPow2()
485       // Lower anything left over into G_*EXT and G_LOAD
486       .lower();
487   }
488 
489   auto IsPtrVecPred = [=](const LegalityQuery &Query) {
490     const LLT &ValTy = Query.Types[0];
491     return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0;
492   };
493 
494   getActionDefinitionsBuilder(G_LOAD)
495       .customIf([=](const LegalityQuery &Query) {
496         return HasRCPC3 && Query.Types[0] == s128 &&
497                Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
498       })
499       .customIf([=](const LegalityQuery &Query) {
500         return Query.Types[0] == s128 &&
501                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
502       })
503       .legalForTypesWithMemDesc({{s8, p0, s8, 8},
504                                  {s16, p0, s16, 8},
505                                  {s32, p0, s32, 8},
506                                  {s64, p0, s64, 8},
507                                  {p0, p0, s64, 8},
508                                  {s128, p0, s128, 8},
509                                  {v8s8, p0, s64, 8},
510                                  {v16s8, p0, s128, 8},
511                                  {v4s16, p0, s64, 8},
512                                  {v8s16, p0, s128, 8},
513                                  {v2s32, p0, s64, 8},
514                                  {v4s32, p0, s128, 8},
515                                  {v2s64, p0, s128, 8}})
516       // These extends are also legal
517       .legalForTypesWithMemDesc(
518           {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
519       .legalForTypesWithMemDesc({
520           // SVE vscale x 128 bit base sizes
521           {nxv16s8, p0, nxv16s8, 8},
522           {nxv8s16, p0, nxv8s16, 8},
523           {nxv4s32, p0, nxv4s32, 8},
524           {nxv2s64, p0, nxv2s64, 8},
525       })
526       .widenScalarToNextPow2(0, /* MinSize = */ 8)
527       .clampMaxNumElements(0, s8, 16)
528       .clampMaxNumElements(0, s16, 8)
529       .clampMaxNumElements(0, s32, 4)
530       .clampMaxNumElements(0, s64, 2)
531       .clampMaxNumElements(0, p0, 2)
532       .lowerIfMemSizeNotByteSizePow2()
533       .clampScalar(0, s8, s64)
534       .narrowScalarIf(
535           [=](const LegalityQuery &Query) {
536             // Clamp extending load results to 32-bits.
537             return Query.Types[0].isScalar() &&
538                    Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
539                    Query.Types[0].getSizeInBits() > 32;
540           },
541           changeTo(0, s32))
542       // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
543       .bitcastIf(typeInSet(0, {v4s8}),
544                  [=](const LegalityQuery &Query) {
545                    const LLT VecTy = Query.Types[0];
546                    return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
547                  })
548       .customIf(IsPtrVecPred)
549       .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
550       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
551 
552   getActionDefinitionsBuilder(G_STORE)
553       .customIf([=](const LegalityQuery &Query) {
554         return HasRCPC3 && Query.Types[0] == s128 &&
555                Query.MMODescrs[0].Ordering == AtomicOrdering::Release;
556       })
557       .customIf([=](const LegalityQuery &Query) {
558         return Query.Types[0] == s128 &&
559                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
560       })
561       .legalForTypesWithMemDesc(
562           {{s8, p0, s8, 8},     {s16, p0, s8, 8},  // truncstorei8 from s16
563            {s32, p0, s8, 8},                       // truncstorei8 from s32
564            {s64, p0, s8, 8},                       // truncstorei8 from s64
565            {s16, p0, s16, 8},   {s32, p0, s16, 8}, // truncstorei16 from s32
566            {s64, p0, s16, 8},                      // truncstorei16 from s64
567            {s32, p0, s8, 8},    {s32, p0, s16, 8},    {s32, p0, s32, 8},
568            {s64, p0, s64, 8},   {s64, p0, s32, 8}, // truncstorei32 from s64
569            {p0, p0, s64, 8},    {s128, p0, s128, 8},  {v16s8, p0, s128, 8},
570            {v8s8, p0, s64, 8},  {v4s16, p0, s64, 8},  {v8s16, p0, s128, 8},
571            {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
572       .legalForTypesWithMemDesc({
573           // SVE vscale x 128 bit base sizes
574           // TODO: Add nxv2p0. Consider bitcastIf.
575           //       See #92130
576           // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
577           {nxv16s8, p0, nxv16s8, 8},
578           {nxv8s16, p0, nxv8s16, 8},
579           {nxv4s32, p0, nxv4s32, 8},
580           {nxv2s64, p0, nxv2s64, 8},
581       })
582       .clampScalar(0, s8, s64)
583       .minScalarOrElt(0, s8)
584       .lowerIf([=](const LegalityQuery &Query) {
585         return Query.Types[0].isScalar() &&
586                Query.Types[0] != Query.MMODescrs[0].MemoryTy;
587       })
588       // Maximum: sN * k = 128
589       .clampMaxNumElements(0, s8, 16)
590       .clampMaxNumElements(0, s16, 8)
591       .clampMaxNumElements(0, s32, 4)
592       .clampMaxNumElements(0, s64, 2)
593       .clampMaxNumElements(0, p0, 2)
594       .lowerIfMemSizeNotPow2()
595       // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
596       .bitcastIf(all(typeInSet(0, {v4s8}),
597                      LegalityPredicate([=](const LegalityQuery &Query) {
598                        return Query.Types[0].getSizeInBits() ==
599                               Query.MMODescrs[0].MemoryTy.getSizeInBits();
600                      })),
601                  [=](const LegalityQuery &Query) {
602                    const LLT VecTy = Query.Types[0];
603                    return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
604                  })
605       .customIf(IsPtrVecPred)
606       .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
607       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
608       .lower();
609 
610   getActionDefinitionsBuilder(G_INDEXED_STORE)
611       // Idx 0 == Ptr, Idx 1 == Val
612       // TODO: we can implement legalizations but as of now these are
613       // generated in a very specific way.
614       .legalForTypesWithMemDesc({
615           {p0, s8, s8, 8},
616           {p0, s16, s16, 8},
617           {p0, s32, s8, 8},
618           {p0, s32, s16, 8},
619           {p0, s32, s32, 8},
620           {p0, s64, s64, 8},
621           {p0, p0, p0, 8},
622           {p0, v8s8, v8s8, 8},
623           {p0, v16s8, v16s8, 8},
624           {p0, v4s16, v4s16, 8},
625           {p0, v8s16, v8s16, 8},
626           {p0, v2s32, v2s32, 8},
627           {p0, v4s32, v4s32, 8},
628           {p0, v2s64, v2s64, 8},
629           {p0, v2p0, v2p0, 8},
630           {p0, s128, s128, 8},
631       })
632       .unsupported();
633 
634   auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {
635     LLT LdTy = Query.Types[0];
636     LLT PtrTy = Query.Types[1];
637     if (!llvm::is_contained(PackedVectorAllTypesVec, LdTy) &&
638         !llvm::is_contained(ScalarAndPtrTypesVec, LdTy) && LdTy != s128)
639       return false;
640     if (PtrTy != p0)
641       return false;
642     return true;
643   };
644   getActionDefinitionsBuilder(G_INDEXED_LOAD)
645       .unsupportedIf(
646           atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
647       .legalIf(IndexedLoadBasicPred)
648       .unsupported();
649   getActionDefinitionsBuilder({G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD})
650       .unsupportedIf(
651           atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
652       .legalIf(all(typeInSet(0, {s16, s32, s64}),
653                    LegalityPredicate([=](const LegalityQuery &Q) {
654                      LLT LdTy = Q.Types[0];
655                      LLT PtrTy = Q.Types[1];
656                      LLT MemTy = Q.MMODescrs[0].MemoryTy;
657                      if (PtrTy != p0)
658                        return false;
659                      if (LdTy == s16)
660                        return MemTy == s8;
661                      if (LdTy == s32)
662                        return MemTy == s8 || MemTy == s16;
663                      if (LdTy == s64)
664                        return MemTy == s8 || MemTy == s16 || MemTy == s32;
665                      return false;
666                    })))
667       .unsupported();
668 
669   // Constants
670   getActionDefinitionsBuilder(G_CONSTANT)
671       .legalFor({p0, s8, s16, s32, s64})
672       .widenScalarToNextPow2(0)
673       .clampScalar(0, s8, s64);
674   getActionDefinitionsBuilder(G_FCONSTANT)
675       .legalFor({s32, s64, s128})
676       .legalFor(HasFP16, {s16})
677       .clampScalar(0, MinFPScalar, s128);
678 
679   // FIXME: fix moreElementsToNextPow2
680   getActionDefinitionsBuilder(G_ICMP)
681       .legalFor({{s32, s32}, {s32, s64}, {s32, p0}})
682       .widenScalarOrEltToNextPow2(1)
683       .clampScalar(1, s32, s64)
684       .clampScalar(0, s32, s32)
685       .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
686       .minScalarEltSameAsIf(
687           [=](const LegalityQuery &Query) {
688             const LLT &Ty = Query.Types[0];
689             const LLT &SrcTy = Query.Types[1];
690             return Ty.isVector() && !SrcTy.isPointerVector() &&
691                    Ty.getElementType() != SrcTy.getElementType();
692           },
693           0, 1)
694       .minScalarOrEltIf(
695           [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
696           1, s32)
697       .minScalarOrEltIf(
698           [=](const LegalityQuery &Query) {
699             return Query.Types[1].isPointerVector();
700           },
701           0, s64)
702       .moreElementsToNextPow2(1)
703       .clampNumElements(1, v8s8, v16s8)
704       .clampNumElements(1, v4s16, v8s16)
705       .clampNumElements(1, v2s32, v4s32)
706       .clampNumElements(1, v2s64, v2s64)
707       .clampNumElements(1, v2p0, v2p0)
708       .customIf(isVector(0));
709 
710   getActionDefinitionsBuilder(G_FCMP)
711       .legalFor({{s32, s32},
712                  {s32, s64},
713                  {v4s32, v4s32},
714                  {v2s32, v2s32},
715                  {v2s64, v2s64}})
716       .legalFor(HasFP16, {{s32, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
717       .widenScalarOrEltToNextPow2(1)
718       .clampScalar(0, s32, s32)
719       .minScalarOrElt(1, MinFPScalar)
720       .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
721       .minScalarEltSameAsIf(
722           [=](const LegalityQuery &Query) {
723             const LLT &Ty = Query.Types[0];
724             const LLT &SrcTy = Query.Types[1];
725             return Ty.isVector() && !SrcTy.isPointerVector() &&
726                    Ty.getElementType() != SrcTy.getElementType();
727           },
728           0, 1)
729       .clampNumElements(1, v4s16, v8s16)
730       .clampNumElements(1, v2s32, v4s32)
731       .clampMaxNumElements(1, s64, 2)
732       .moreElementsToNextPow2(1)
733       .libcallFor({{s32, s128}});
734 
735   // Extensions
736   auto ExtLegalFunc = [=](const LegalityQuery &Query) {
737     unsigned DstSize = Query.Types[0].getSizeInBits();
738 
739     // Handle legal vectors using legalFor
740     if (Query.Types[0].isVector())
741       return false;
742 
743     if (DstSize < 8 || DstSize >= 128 || !isPowerOf2_32(DstSize))
744       return false; // Extending to a scalar s128 needs narrowing.
745 
746     const LLT &SrcTy = Query.Types[1];
747 
748     // Make sure we fit in a register otherwise. Don't bother checking that
749     // the source type is below 128 bits. We shouldn't be allowing anything
750     // through which is wider than the destination in the first place.
751     unsigned SrcSize = SrcTy.getSizeInBits();
752     if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
753       return false;
754 
755     return true;
756   };
757   getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
758       .legalIf(ExtLegalFunc)
759       .legalFor({{v8s16, v8s8}, {v4s32, v4s16}, {v2s64, v2s32}})
760       .clampScalar(0, s64, s64) // Just for s128, others are handled above.
761       .moreElementsToNextPow2(0)
762       .clampMaxNumElements(1, s8, 8)
763       .clampMaxNumElements(1, s16, 4)
764       .clampMaxNumElements(1, s32, 2)
765       // Tries to convert a large EXTEND into two smaller EXTENDs
766       .lowerIf([=](const LegalityQuery &Query) {
767         return (Query.Types[0].getScalarSizeInBits() >
768                 Query.Types[1].getScalarSizeInBits() * 2) &&
769                Query.Types[0].isVector() &&
770                (Query.Types[1].getScalarSizeInBits() == 8 ||
771                 Query.Types[1].getScalarSizeInBits() == 16);
772       })
773       .clampMinNumElements(1, s8, 8)
774       .clampMinNumElements(1, s16, 4)
775       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
776 
777   getActionDefinitionsBuilder(G_TRUNC)
778       .legalFor({{v8s8, v8s16}, {v4s16, v4s32}, {v2s32, v2s64}})
779       .moreElementsToNextPow2(0)
780       .clampMaxNumElements(0, s8, 8)
781       .clampMaxNumElements(0, s16, 4)
782       .clampMaxNumElements(0, s32, 2)
783       .minScalarOrEltIf(
784           [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
785           0, s8)
786       .lowerIf([=](const LegalityQuery &Query) {
787         LLT DstTy = Query.Types[0];
788         LLT SrcTy = Query.Types[1];
789         return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
790                DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
791       })
792       .clampMinNumElements(0, s8, 8)
793       .clampMinNumElements(0, s16, 4)
794       .alwaysLegal();
795 
796   getActionDefinitionsBuilder(G_SEXT_INREG)
797       .legalFor({s32, s64})
798       .legalFor(PackedVectorAllTypeList)
799       .maxScalar(0, s64)
800       .clampNumElements(0, v8s8, v16s8)
801       .clampNumElements(0, v4s16, v8s16)
802       .clampNumElements(0, v2s32, v4s32)
803       .clampMaxNumElements(0, s64, 2)
804       .lower();
805 
806   // FP conversions
807   getActionDefinitionsBuilder(G_FPTRUNC)
808       .legalFor(
809           {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
810       .libcallFor({{s16, s128}, {s32, s128}, {s64, s128}})
811       .clampNumElements(0, v4s16, v4s16)
812       .clampNumElements(0, v2s32, v2s32)
813       .scalarize(0);
814 
815   getActionDefinitionsBuilder(G_FPEXT)
816       .legalFor(
817           {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
818       .libcallFor({{s128, s64}, {s128, s32}, {s128, s16}})
819       .clampNumElements(0, v4s32, v4s32)
820       .clampNumElements(0, v2s64, v2s64)
821       .scalarize(0);
822 
823   // Conversions
824   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
825       .legalFor({{s32, s32},
826                  {s64, s32},
827                  {s32, s64},
828                  {s64, s64},
829                  {v2s32, v2s32},
830                  {v4s32, v4s32},
831                  {v2s64, v2s64}})
832       .legalFor(HasFP16,
833                 {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
834       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
835       .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
836       // The range of a fp16 value fits into an i17, so we can lower the width
837       // to i64.
838       .narrowScalarIf(
839           [=](const LegalityQuery &Query) {
840             return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64;
841           },
842           changeTo(0, s64))
843       .moreElementsToNextPow2(0)
844       .widenScalarOrEltToNextPow2OrMinSize(0)
845       .minScalar(0, s32)
846       .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
847       .widenScalarIf(
848           [=](const LegalityQuery &Query) {
849             return Query.Types[0].getScalarSizeInBits() <= 64 &&
850                    Query.Types[0].getScalarSizeInBits() >
851                        Query.Types[1].getScalarSizeInBits();
852           },
853           LegalizeMutations::changeElementSizeTo(1, 0))
854       .widenScalarIf(
855           [=](const LegalityQuery &Query) {
856             return Query.Types[1].getScalarSizeInBits() <= 64 &&
857                    Query.Types[0].getScalarSizeInBits() <
858                        Query.Types[1].getScalarSizeInBits();
859           },
860           LegalizeMutations::changeElementSizeTo(0, 1))
861       .clampNumElements(0, v4s16, v8s16)
862       .clampNumElements(0, v2s32, v4s32)
863       .clampMaxNumElements(0, s64, 2)
864       .libcallFor(
865           {{s32, s128}, {s64, s128}, {s128, s128}, {s128, s32}, {s128, s64}});
866 
867   getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
868       .legalFor({{s32, s32},
869                  {s64, s32},
870                  {s32, s64},
871                  {s64, s64},
872                  {v2s32, v2s32},
873                  {v4s32, v4s32},
874                  {v2s64, v2s64}})
875       .legalFor(HasFP16,
876                 {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
877       // Handle types larger than i64 by scalarizing/lowering.
878       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
879       .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
880       // The range of a fp16 value fits into an i17, so we can lower the width
881       // to i64.
882       .narrowScalarIf(
883           [=](const LegalityQuery &Query) {
884             return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64;
885           },
886           changeTo(0, s64))
887       .lowerIf(::any(scalarWiderThan(0, 64), scalarWiderThan(1, 64)), 0)
888       .moreElementsToNextPow2(0)
889       .widenScalarToNextPow2(0, /*MinSize=*/32)
890       .minScalar(0, s32)
891       .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
892       .widenScalarIf(
893           [=](const LegalityQuery &Query) {
894             unsigned ITySize = Query.Types[0].getScalarSizeInBits();
895             return (ITySize == 16 || ITySize == 32 || ITySize == 64) &&
896                    ITySize > Query.Types[1].getScalarSizeInBits();
897           },
898           LegalizeMutations::changeElementSizeTo(1, 0))
899       .widenScalarIf(
900           [=](const LegalityQuery &Query) {
901             unsigned FTySize = Query.Types[1].getScalarSizeInBits();
902             return (FTySize == 16 || FTySize == 32 || FTySize == 64) &&
903                    Query.Types[0].getScalarSizeInBits() < FTySize;
904           },
905           LegalizeMutations::changeElementSizeTo(0, 1))
906       .widenScalarOrEltToNextPow2(0)
907       .clampNumElements(0, v4s16, v8s16)
908       .clampNumElements(0, v2s32, v4s32)
909       .clampMaxNumElements(0, s64, 2);
910 
911   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
912       .legalFor({{s32, s32},
913                  {s64, s32},
914                  {s32, s64},
915                  {s64, s64},
916                  {v2s32, v2s32},
917                  {v4s32, v4s32},
918                  {v2s64, v2s64}})
919       .legalFor(HasFP16,
920                 {{s16, s32}, {s16, s64}, {v4s16, v4s16}, {v8s16, v8s16}})
921       .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
922       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
923       .moreElementsToNextPow2(1)
924       .widenScalarOrEltToNextPow2OrMinSize(1)
925       .minScalar(1, s32)
926       .lowerIf([](const LegalityQuery &Query) {
927         return Query.Types[1].isVector() &&
928                Query.Types[1].getScalarSizeInBits() == 64 &&
929                Query.Types[0].getScalarSizeInBits() == 16;
930       })
931       .widenScalarOrEltToNextPow2OrMinSize(0, /*MinSize=*/HasFP16 ? 16 : 32)
932       .scalarizeIf(
933           // v2i64->v2f32 needs to scalarize to avoid double-rounding issues.
934           [](const LegalityQuery &Query) {
935             return Query.Types[0].getScalarSizeInBits() == 32 &&
936                    Query.Types[1].getScalarSizeInBits() == 64;
937           },
938           0)
939       .widenScalarIf(
940           [](const LegalityQuery &Query) {
941             return Query.Types[1].getScalarSizeInBits() <= 64 &&
942                    Query.Types[0].getScalarSizeInBits() <
943                        Query.Types[1].getScalarSizeInBits();
944           },
945           LegalizeMutations::changeElementSizeTo(0, 1))
946       .widenScalarIf(
947           [](const LegalityQuery &Query) {
948             return Query.Types[0].getScalarSizeInBits() <= 64 &&
949                    Query.Types[0].getScalarSizeInBits() >
950                        Query.Types[1].getScalarSizeInBits();
951           },
952           LegalizeMutations::changeElementSizeTo(1, 0))
953       .clampNumElements(0, v4s16, v8s16)
954       .clampNumElements(0, v2s32, v4s32)
955       .clampMaxNumElements(0, s64, 2)
956       .libcallFor({{s16, s128},
957                    {s32, s128},
958                    {s64, s128},
959                    {s128, s128},
960                    {s128, s32},
961                    {s128, s64}});
962 
963   // Control-flow
964   getActionDefinitionsBuilder(G_BRCOND)
965     .legalFor({s32})
966     .clampScalar(0, s32, s32);
967   getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
968 
969   getActionDefinitionsBuilder(G_SELECT)
970       .legalFor({{s32, s32}, {s64, s32}, {p0, s32}})
971       .widenScalarToNextPow2(0)
972       .clampScalar(0, s32, s64)
973       .clampScalar(1, s32, s32)
974       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
975       .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
976       .lowerIf(isVector(0));
977 
978   // Pointer-handling
979   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
980 
981   if (TM.getCodeModel() == CodeModel::Small)
982     getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
983   else
984     getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
985 
986   getActionDefinitionsBuilder(G_PTRAUTH_GLOBAL_VALUE)
987       .legalIf(all(typeIs(0, p0), typeIs(1, p0)));
988 
989   getActionDefinitionsBuilder(G_PTRTOINT)
990       .legalFor({{s64, p0}, {v2s64, v2p0}})
991       .widenScalarToNextPow2(0, 64)
992       .clampScalar(0, s64, s64)
993       .clampMaxNumElements(0, s64, 2);
994 
995   getActionDefinitionsBuilder(G_INTTOPTR)
996       .unsupportedIf([&](const LegalityQuery &Query) {
997         return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
998       })
999       .legalFor({{p0, s64}, {v2p0, v2s64}})
1000       .clampMaxNumElements(1, s64, 2);
1001 
1002   // Casts for 32 and 64-bit width type are just copies.
1003   // Same for 128-bit width type, except they are on the FPR bank.
1004   getActionDefinitionsBuilder(G_BITCAST)
1005       // Keeping 32-bit instructions legal to prevent regression in some tests
1006       .legalForCartesianProduct({s32, v2s16, v4s8})
1007       .legalForCartesianProduct({s64, v8s8, v4s16, v2s32})
1008       .legalForCartesianProduct({s128, v16s8, v8s16, v4s32, v2s64, v2p0})
1009       .customIf([=](const LegalityQuery &Query) {
1010         // Handle casts from i1 vectors to scalars.
1011         LLT DstTy = Query.Types[0];
1012         LLT SrcTy = Query.Types[1];
1013         return DstTy.isScalar() && SrcTy.isVector() &&
1014                SrcTy.getScalarSizeInBits() == 1;
1015       })
1016       .lowerIf([=](const LegalityQuery &Query) {
1017         return Query.Types[0].isVector() != Query.Types[1].isVector();
1018       })
1019       .moreElementsToNextPow2(0)
1020       .clampNumElements(0, v8s8, v16s8)
1021       .clampNumElements(0, v4s16, v8s16)
1022       .clampNumElements(0, v2s32, v4s32)
1023       .lower();
1024 
1025   getActionDefinitionsBuilder(G_VASTART).legalFor({p0});
1026 
1027   // va_list must be a pointer, but most sized types are pretty easy to handle
1028   // as the destination.
1029   getActionDefinitionsBuilder(G_VAARG)
1030       .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
1031       .clampScalar(0, s8, s64)
1032       .widenScalarToNextPow2(0, /*Min*/ 8);
1033 
1034   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
1035       .lowerIf(
1036           all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
1037 
1038   bool UseOutlineAtomics = ST.outlineAtomics() && !ST.hasLSE();
1039 
1040   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1041       .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}})
1042       .customFor(!UseOutlineAtomics, {{s128, p0}})
1043       .libcallFor(UseOutlineAtomics,
1044                   {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}, {s128, p0}})
1045       .clampScalar(0, s32, s64);
1046 
1047   getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
1048                                G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR,
1049                                G_ATOMICRMW_XOR})
1050       .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}})
1051       .libcallFor(UseOutlineAtomics,
1052                   {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}})
1053       .clampScalar(0, s32, s64);
1054 
1055   // Do not outline these atomics operations, as per comment in
1056   // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR().
1057   getActionDefinitionsBuilder(
1058       {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
1059       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)))
1060       .clampScalar(0, s32, s64);
1061 
1062   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});
1063 
1064   // Merge/Unmerge
1065   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1066     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1067     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1068     getActionDefinitionsBuilder(Op)
1069         .widenScalarToNextPow2(LitTyIdx, 8)
1070         .widenScalarToNextPow2(BigTyIdx, 32)
1071         .clampScalar(LitTyIdx, s8, s64)
1072         .clampScalar(BigTyIdx, s32, s128)
1073         .legalIf([=](const LegalityQuery &Q) {
1074           switch (Q.Types[BigTyIdx].getSizeInBits()) {
1075           case 32:
1076           case 64:
1077           case 128:
1078             break;
1079           default:
1080             return false;
1081           }
1082           switch (Q.Types[LitTyIdx].getSizeInBits()) {
1083           case 8:
1084           case 16:
1085           case 32:
1086           case 64:
1087             return true;
1088           default:
1089             return false;
1090           }
1091         });
1092   }
1093 
1094   // TODO : nxv4s16, nxv2s16, nxv2s32
1095   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1096       .legalFor(HasSVE, {{s16, nxv16s8, s64},
1097                          {s16, nxv8s16, s64},
1098                          {s32, nxv4s32, s64},
1099                          {s64, nxv2s64, s64}})
1100       .unsupportedIf([=](const LegalityQuery &Query) {
1101         const LLT &EltTy = Query.Types[1].getElementType();
1102         if (Query.Types[1].isScalableVector())
1103           return false;
1104         return Query.Types[0] != EltTy;
1105       })
1106       .minScalar(2, s64)
1107       .customIf([=](const LegalityQuery &Query) {
1108         const LLT &VecTy = Query.Types[1];
1109         return VecTy == v8s8 || VecTy == v16s8 || VecTy == v2s16 ||
1110                VecTy == v4s16 || VecTy == v8s16 || VecTy == v2s32 ||
1111                VecTy == v4s32 || VecTy == v2s64 || VecTy == v2p0;
1112       })
1113       .minScalarOrEltIf(
1114           [=](const LegalityQuery &Query) {
1115             // We want to promote to <M x s1> to <M x s64> if that wouldn't
1116             // cause the total vec size to be > 128b.
1117             return Query.Types[1].isFixedVector() &&
1118                    Query.Types[1].getNumElements() <= 2;
1119           },
1120           0, s64)
1121       .minScalarOrEltIf(
1122           [=](const LegalityQuery &Query) {
1123             return Query.Types[1].isFixedVector() &&
1124                    Query.Types[1].getNumElements() <= 4;
1125           },
1126           0, s32)
1127       .minScalarOrEltIf(
1128           [=](const LegalityQuery &Query) {
1129             return Query.Types[1].isFixedVector() &&
1130                    Query.Types[1].getNumElements() <= 8;
1131           },
1132           0, s16)
1133       .minScalarOrEltIf(
1134           [=](const LegalityQuery &Query) {
1135             return Query.Types[1].isFixedVector() &&
1136                    Query.Types[1].getNumElements() <= 16;
1137           },
1138           0, s8)
1139       .minScalarOrElt(0, s8) // Worst case, we need at least s8.
1140       .moreElementsToNextPow2(1)
1141       .clampMaxNumElements(1, s64, 2)
1142       .clampMaxNumElements(1, s32, 4)
1143       .clampMaxNumElements(1, s16, 8)
1144       .clampMaxNumElements(1, s8, 16)
1145       .clampMaxNumElements(1, p0, 2);
1146 
1147   getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
1148       .legalIf(
1149           typeInSet(0, {v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64, v2p0}))
1150       .legalFor(HasSVE, {{nxv16s8, s32, s64},
1151                          {nxv8s16, s32, s64},
1152                          {nxv4s32, s32, s64},
1153                          {nxv2s64, s64, s64}})
1154       .moreElementsToNextPow2(0)
1155       .widenVectorEltsToVectorMinSize(0, 64)
1156       .clampNumElements(0, v8s8, v16s8)
1157       .clampNumElements(0, v4s16, v8s16)
1158       .clampNumElements(0, v2s32, v4s32)
1159       .clampMaxNumElements(0, s64, 2)
1160       .clampMaxNumElements(0, p0, 2);
1161 
1162   getActionDefinitionsBuilder(G_BUILD_VECTOR)
1163       .legalFor({{v8s8, s8},
1164                  {v16s8, s8},
1165                  {v4s16, s16},
1166                  {v8s16, s16},
1167                  {v2s32, s32},
1168                  {v4s32, s32},
1169                  {v2s64, s64},
1170                  {v2p0, p0}})
1171       .clampNumElements(0, v4s32, v4s32)
1172       .clampNumElements(0, v2s64, v2s64)
1173       .minScalarOrElt(0, s8)
1174       .widenVectorEltsToVectorMinSize(0, 64)
1175       .widenScalarOrEltToNextPow2(0)
1176       .minScalarSameAs(1, 0);
1177 
1178   getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
1179 
1180   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1181       .legalIf([=](const LegalityQuery &Query) {
1182         const LLT &DstTy = Query.Types[0];
1183         const LLT &SrcTy = Query.Types[1];
1184         // For now just support the TBL2 variant which needs the source vectors
1185         // to be the same size as the dest.
1186         if (DstTy != SrcTy)
1187           return false;
1188         return llvm::is_contained(
1189             {v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64}, DstTy);
1190       })
1191       // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors) or scalar
1192       // destinations, we just want those lowered into G_BUILD_VECTOR or
1193       // G_EXTRACT_ELEMENT.
1194       .lowerIf([=](const LegalityQuery &Query) {
1195         return !Query.Types[0].isVector() || !Query.Types[1].isVector();
1196       })
1197       .moreElementsIf(
1198           [](const LegalityQuery &Query) {
1199             return Query.Types[0].isVector() && Query.Types[1].isVector() &&
1200                    Query.Types[0].getNumElements() >
1201                        Query.Types[1].getNumElements();
1202           },
1203           changeTo(1, 0))
1204       .moreElementsToNextPow2(0)
1205       .moreElementsIf(
1206           [](const LegalityQuery &Query) {
1207             return Query.Types[0].isVector() && Query.Types[1].isVector() &&
1208                    Query.Types[0].getNumElements() <
1209                        Query.Types[1].getNumElements();
1210           },
1211           changeTo(0, 1))
1212       .widenScalarOrEltToNextPow2OrMinSize(0, 8)
1213       .clampNumElements(0, v8s8, v16s8)
1214       .clampNumElements(0, v4s16, v8s16)
1215       .clampNumElements(0, v4s32, v4s32)
1216       .clampNumElements(0, v2s64, v2s64)
1217       .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
1218       .bitcastIf(isPointerVector(0), [=](const LegalityQuery &Query) {
1219         // Bitcast pointers vector to i64.
1220         const LLT DstTy = Query.Types[0];
1221         return std::pair(0, LLT::vector(DstTy.getElementCount(), 64));
1222       });
1223 
1224   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1225       .legalFor({{v16s8, v8s8}, {v8s16, v4s16}, {v4s32, v2s32}})
1226       .bitcastIf(
1227           [=](const LegalityQuery &Query) {
1228             return Query.Types[0].getSizeInBits() <= 128 &&
1229                    Query.Types[1].getSizeInBits() <= 64;
1230           },
1231           [=](const LegalityQuery &Query) {
1232             const LLT DstTy = Query.Types[0];
1233             const LLT SrcTy = Query.Types[1];
1234             return std::pair(
1235                 0, DstTy.changeElementSize(SrcTy.getSizeInBits())
1236                        .changeElementCount(
1237                            DstTy.getElementCount().divideCoefficientBy(
1238                                SrcTy.getNumElements())));
1239           });
1240 
1241   getActionDefinitionsBuilder(G_EXTRACT_SUBVECTOR)
1242       .legalFor({{v8s8, v16s8}, {v4s16, v8s16}, {v2s32, v4s32}})
1243       .widenScalarOrEltToNextPow2(0)
1244       .immIdx(0); // Inform verifier imm idx 0 is handled.
1245 
1246   // TODO: {nxv16s8, s8}, {nxv8s16, s16}
1247   getActionDefinitionsBuilder(G_SPLAT_VECTOR)
1248       .legalFor(HasSVE, {{nxv4s32, s32}, {nxv2s64, s64}});
1249 
1250   getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({p0});
1251 
1252   getActionDefinitionsBuilder(G_BRJT).legalFor({{p0, s64}});
1253 
1254   getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom();
1255 
1256   getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower();
1257 
1258   if (ST.hasMOPS()) {
1259     // G_BZERO is not supported. Currently it is only emitted by
1260     // PreLegalizerCombiner for G_MEMSET with zero constant.
1261     getActionDefinitionsBuilder(G_BZERO).unsupported();
1262 
1263     getActionDefinitionsBuilder(G_MEMSET)
1264         .legalForCartesianProduct({p0}, {s64}, {s64})
1265         .customForCartesianProduct({p0}, {s8}, {s64})
1266         .immIdx(0); // Inform verifier imm idx 0 is handled.
1267 
1268     getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
1269         .legalForCartesianProduct({p0}, {p0}, {s64})
1270         .immIdx(0); // Inform verifier imm idx 0 is handled.
1271 
1272     // G_MEMCPY_INLINE does not have a tailcall immediate
1273     getActionDefinitionsBuilder(G_MEMCPY_INLINE)
1274         .legalForCartesianProduct({p0}, {p0}, {s64});
1275 
1276   } else {
1277     getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
1278         .libcall();
1279   }
1280 
1281   // For fadd reductions we have pairwise operations available. We treat the
1282   // usual legal types as legal and handle the lowering to pairwise instructions
1283   // later.
1284   getActionDefinitionsBuilder(G_VECREDUCE_FADD)
1285       .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
1286       .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}})
1287       .minScalarOrElt(0, MinFPScalar)
1288       .clampMaxNumElements(1, s64, 2)
1289       .clampMaxNumElements(1, s32, 4)
1290       .clampMaxNumElements(1, s16, 8)
1291       .moreElementsToNextPow2(1)
1292       .scalarize(1)
1293       .lower();
1294 
1295   // For fmul reductions we need to split up into individual operations. We
1296   // clamp to 128 bit vectors then to 64bit vectors to produce a cascade of
1297   // smaller types, followed by scalarizing what remains.
1298   getActionDefinitionsBuilder(G_VECREDUCE_FMUL)
1299       .minScalarOrElt(0, MinFPScalar)
1300       .clampMaxNumElements(1, s64, 2)
1301       .clampMaxNumElements(1, s32, 4)
1302       .clampMaxNumElements(1, s16, 8)
1303       .clampMaxNumElements(1, s32, 2)
1304       .clampMaxNumElements(1, s16, 4)
1305       .scalarize(1)
1306       .lower();
1307 
1308   getActionDefinitionsBuilder({G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL})
1309       .scalarize(2)
1310       .lower();
1311 
1312   getActionDefinitionsBuilder(G_VECREDUCE_ADD)
1313       .legalFor({{s8, v8s8},
1314                  {s8, v16s8},
1315                  {s16, v4s16},
1316                  {s16, v8s16},
1317                  {s32, v2s32},
1318                  {s32, v4s32},
1319                  {s64, v2s64}})
1320       .moreElementsToNextPow2(1)
1321       .clampMaxNumElements(1, s64, 2)
1322       .clampMaxNumElements(1, s32, 4)
1323       .clampMaxNumElements(1, s16, 8)
1324       .clampMaxNumElements(1, s8, 16)
1325       .widenVectorEltsToVectorMinSize(1, 64)
1326       .scalarize(1);
1327 
1328   getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX,
1329                                G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM})
1330       .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
1331       .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}})
1332       .minScalarOrElt(0, MinFPScalar)
1333       .clampMaxNumElements(1, s64, 2)
1334       .clampMaxNumElements(1, s32, 4)
1335       .clampMaxNumElements(1, s16, 8)
1336       .lower();
1337 
1338   getActionDefinitionsBuilder(G_VECREDUCE_MUL)
1339       .clampMaxNumElements(1, s32, 2)
1340       .clampMaxNumElements(1, s16, 4)
1341       .clampMaxNumElements(1, s8, 8)
1342       .scalarize(1)
1343       .lower();
1344 
1345   getActionDefinitionsBuilder(
1346       {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX})
1347       .legalFor({{s8, v8s8},
1348                  {s8, v16s8},
1349                  {s16, v4s16},
1350                  {s16, v8s16},
1351                  {s32, v2s32},
1352                  {s32, v4s32}})
1353       .moreElementsIf(
1354           [=](const LegalityQuery &Query) {
1355             return Query.Types[1].isVector() &&
1356                    Query.Types[1].getElementType() != s8 &&
1357                    Query.Types[1].getNumElements() & 1;
1358           },
1359           LegalizeMutations::moreElementsToNextPow2(1))
1360       .clampMaxNumElements(1, s64, 2)
1361       .clampMaxNumElements(1, s32, 4)
1362       .clampMaxNumElements(1, s16, 8)
1363       .clampMaxNumElements(1, s8, 16)
1364       .scalarize(1)
1365       .lower();
1366 
1367   getActionDefinitionsBuilder(
1368       {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
1369       // Try to break down into smaller vectors as long as they're at least 64
1370       // bits. This lets us use vector operations for some parts of the
1371       // reduction.
1372       .fewerElementsIf(
1373           [=](const LegalityQuery &Q) {
1374             LLT SrcTy = Q.Types[1];
1375             if (SrcTy.isScalar())
1376               return false;
1377             if (!isPowerOf2_32(SrcTy.getNumElements()))
1378               return false;
1379             // We can usually perform 64b vector operations.
1380             return SrcTy.getSizeInBits() > 64;
1381           },
1382           [=](const LegalityQuery &Q) {
1383             LLT SrcTy = Q.Types[1];
1384             return std::make_pair(1, SrcTy.divide(2));
1385           })
1386       .scalarize(1)
1387       .lower();
1388 
1389   // TODO: Update this to correct handling when adding AArch64/SVE support.
1390   getActionDefinitionsBuilder(G_VECTOR_COMPRESS).lower();
1391 
1392   // Access to floating-point environment.
1393   getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV,
1394                                G_GET_FPMODE, G_SET_FPMODE, G_RESET_FPMODE})
1395       .libcall();
1396 
1397   getActionDefinitionsBuilder(G_IS_FPCLASS).lower();
1398 
1399   getActionDefinitionsBuilder(G_PREFETCH).custom();
1400 
1401   getActionDefinitionsBuilder({G_SCMP, G_UCMP}).lower();
1402 
1403   getLegacyLegalizerInfo().computeTables();
1404   verify(*ST.getInstrInfo());
1405 }
1406 
legalizeCustom(LegalizerHelper & Helper,MachineInstr & MI,LostDebugLocObserver & LocObserver) const1407 bool AArch64LegalizerInfo::legalizeCustom(
1408     LegalizerHelper &Helper, MachineInstr &MI,
1409     LostDebugLocObserver &LocObserver) const {
1410   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1411   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1412   GISelChangeObserver &Observer = Helper.Observer;
1413   switch (MI.getOpcode()) {
1414   default:
1415     // No idea what to do.
1416     return false;
1417   case TargetOpcode::G_VAARG:
1418     return legalizeVaArg(MI, MRI, MIRBuilder);
1419   case TargetOpcode::G_LOAD:
1420   case TargetOpcode::G_STORE:
1421     return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
1422   case TargetOpcode::G_SHL:
1423   case TargetOpcode::G_ASHR:
1424   case TargetOpcode::G_LSHR:
1425     return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
1426   case TargetOpcode::G_GLOBAL_VALUE:
1427     return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
1428   case TargetOpcode::G_SBFX:
1429   case TargetOpcode::G_UBFX:
1430     return legalizeBitfieldExtract(MI, MRI, Helper);
1431   case TargetOpcode::G_FSHL:
1432   case TargetOpcode::G_FSHR:
1433     return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper);
1434   case TargetOpcode::G_ROTR:
1435     return legalizeRotate(MI, MRI, Helper);
1436   case TargetOpcode::G_CTPOP:
1437     return legalizeCTPOP(MI, MRI, Helper);
1438   case TargetOpcode::G_ATOMIC_CMPXCHG:
1439     return legalizeAtomicCmpxchg128(MI, MRI, Helper);
1440   case TargetOpcode::G_CTTZ:
1441     return legalizeCTTZ(MI, Helper);
1442   case TargetOpcode::G_BZERO:
1443   case TargetOpcode::G_MEMCPY:
1444   case TargetOpcode::G_MEMMOVE:
1445   case TargetOpcode::G_MEMSET:
1446     return legalizeMemOps(MI, Helper);
1447   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1448     return legalizeExtractVectorElt(MI, MRI, Helper);
1449   case TargetOpcode::G_DYN_STACKALLOC:
1450     return legalizeDynStackAlloc(MI, Helper);
1451   case TargetOpcode::G_PREFETCH:
1452     return legalizePrefetch(MI, Helper);
1453   case TargetOpcode::G_ABS:
1454     return Helper.lowerAbsToCNeg(MI);
1455   case TargetOpcode::G_ICMP:
1456     return legalizeICMP(MI, MRI, MIRBuilder);
1457   case TargetOpcode::G_BITCAST:
1458     return legalizeBitcast(MI, Helper);
1459   }
1460 
1461   llvm_unreachable("expected switch to return");
1462 }
1463 
legalizeBitcast(MachineInstr & MI,LegalizerHelper & Helper) const1464 bool AArch64LegalizerInfo::legalizeBitcast(MachineInstr &MI,
1465                                            LegalizerHelper &Helper) const {
1466   assert(MI.getOpcode() == TargetOpcode::G_BITCAST && "Unexpected opcode");
1467   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
1468   // We're trying to handle casts from i1 vectors to scalars but reloading from
1469   // stack.
1470   if (!DstTy.isScalar() || !SrcTy.isVector() ||
1471       SrcTy.getElementType() != LLT::scalar(1))
1472     return false;
1473 
1474   Helper.createStackStoreLoad(DstReg, SrcReg);
1475   MI.eraseFromParent();
1476   return true;
1477 }
1478 
legalizeFunnelShift(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & MIRBuilder,GISelChangeObserver & Observer,LegalizerHelper & Helper) const1479 bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
1480                                                MachineRegisterInfo &MRI,
1481                                                MachineIRBuilder &MIRBuilder,
1482                                                GISelChangeObserver &Observer,
1483                                                LegalizerHelper &Helper) const {
1484   assert(MI.getOpcode() == TargetOpcode::G_FSHL ||
1485          MI.getOpcode() == TargetOpcode::G_FSHR);
1486 
1487   // Keep as G_FSHR if shift amount is a G_CONSTANT, else use generic
1488   // lowering
1489   Register ShiftNo = MI.getOperand(3).getReg();
1490   LLT ShiftTy = MRI.getType(ShiftNo);
1491   auto VRegAndVal = getIConstantVRegValWithLookThrough(ShiftNo, MRI);
1492 
1493   // Adjust shift amount according to Opcode (FSHL/FSHR)
1494   // Convert FSHL to FSHR
1495   LLT OperationTy = MRI.getType(MI.getOperand(0).getReg());
1496   APInt BitWidth(ShiftTy.getSizeInBits(), OperationTy.getSizeInBits(), false);
1497 
1498   // Lower non-constant shifts and leave zero shifts to the optimizer.
1499   if (!VRegAndVal || VRegAndVal->Value.urem(BitWidth) == 0)
1500     return (Helper.lowerFunnelShiftAsShifts(MI) ==
1501             LegalizerHelper::LegalizeResult::Legalized);
1502 
1503   APInt Amount = VRegAndVal->Value.urem(BitWidth);
1504 
1505   Amount = MI.getOpcode() == TargetOpcode::G_FSHL ? BitWidth - Amount : Amount;
1506 
1507   // If the instruction is G_FSHR, has a 64-bit G_CONSTANT for shift amount
1508   // in the range of 0 <-> BitWidth, it is legal
1509   if (ShiftTy.getSizeInBits() == 64 && MI.getOpcode() == TargetOpcode::G_FSHR &&
1510       VRegAndVal->Value.ult(BitWidth))
1511     return true;
1512 
1513   // Cast the ShiftNumber to a 64-bit type
1514   auto Cast64 = MIRBuilder.buildConstant(LLT::scalar(64), Amount.zext(64));
1515 
1516   if (MI.getOpcode() == TargetOpcode::G_FSHR) {
1517     Observer.changingInstr(MI);
1518     MI.getOperand(3).setReg(Cast64.getReg(0));
1519     Observer.changedInstr(MI);
1520   }
1521   // If Opcode is FSHL, remove the FSHL instruction and create a FSHR
1522   // instruction
1523   else if (MI.getOpcode() == TargetOpcode::G_FSHL) {
1524     MIRBuilder.buildInstr(TargetOpcode::G_FSHR, {MI.getOperand(0).getReg()},
1525                           {MI.getOperand(1).getReg(), MI.getOperand(2).getReg(),
1526                            Cast64.getReg(0)});
1527     MI.eraseFromParent();
1528   }
1529   return true;
1530 }
1531 
legalizeICMP(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & MIRBuilder) const1532 bool AArch64LegalizerInfo::legalizeICMP(MachineInstr &MI,
1533                                         MachineRegisterInfo &MRI,
1534                                         MachineIRBuilder &MIRBuilder) const {
1535   Register DstReg = MI.getOperand(0).getReg();
1536   Register SrcReg1 = MI.getOperand(2).getReg();
1537   Register SrcReg2 = MI.getOperand(3).getReg();
1538   LLT DstTy = MRI.getType(DstReg);
1539   LLT SrcTy = MRI.getType(SrcReg1);
1540 
1541   // Check the vector types are legal
1542   if (DstTy.getScalarSizeInBits() != SrcTy.getScalarSizeInBits() ||
1543       DstTy.getNumElements() != SrcTy.getNumElements() ||
1544       (DstTy.getSizeInBits() != 64 && DstTy.getSizeInBits() != 128))
1545     return false;
1546 
1547   // Lowers G_ICMP NE => G_ICMP EQ to allow better pattern matching for
1548   // following passes
1549   CmpInst::Predicate Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
1550   if (Pred != CmpInst::ICMP_NE)
1551     return true;
1552   Register CmpReg =
1553       MIRBuilder
1554           .buildICmp(CmpInst::ICMP_EQ, MRI.getType(DstReg), SrcReg1, SrcReg2)
1555           .getReg(0);
1556   MIRBuilder.buildNot(DstReg, CmpReg);
1557 
1558   MI.eraseFromParent();
1559   return true;
1560 }
1561 
legalizeRotate(MachineInstr & MI,MachineRegisterInfo & MRI,LegalizerHelper & Helper) const1562 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
1563                                           MachineRegisterInfo &MRI,
1564                                           LegalizerHelper &Helper) const {
1565   // To allow for imported patterns to match, we ensure that the rotate amount
1566   // is 64b with an extension.
1567   Register AmtReg = MI.getOperand(2).getReg();
1568   LLT AmtTy = MRI.getType(AmtReg);
1569   (void)AmtTy;
1570   assert(AmtTy.isScalar() && "Expected a scalar rotate");
1571   assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
1572   auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
1573   Helper.Observer.changingInstr(MI);
1574   MI.getOperand(2).setReg(NewAmt.getReg(0));
1575   Helper.Observer.changedInstr(MI);
1576   return true;
1577 }
1578 
legalizeSmallCMGlobalValue(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & MIRBuilder,GISelChangeObserver & Observer) const1579 bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
1580     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1581     GISelChangeObserver &Observer) const {
1582   assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
1583   // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
1584   // G_ADD_LOW instructions.
1585   // By splitting this here, we can optimize accesses in the small code model by
1586   // folding in the G_ADD_LOW into the load/store offset.
1587   auto &GlobalOp = MI.getOperand(1);
1588   // Don't modify an intrinsic call.
1589   if (GlobalOp.isSymbol())
1590     return true;
1591   const auto* GV = GlobalOp.getGlobal();
1592   if (GV->isThreadLocal())
1593     return true; // Don't want to modify TLS vars.
1594 
1595   auto &TM = ST->getTargetLowering()->getTargetMachine();
1596   unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);
1597 
1598   if (OpFlags & AArch64II::MO_GOT)
1599     return true;
1600 
1601   auto Offset = GlobalOp.getOffset();
1602   Register DstReg = MI.getOperand(0).getReg();
1603   auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
1604                   .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
1605   // Set the regclass on the dest reg too.
1606   MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1607 
1608   // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
1609   // by creating a MOVK that sets bits 48-63 of the register to (global address
1610   // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
1611   // prevent an incorrect tag being generated during relocation when the
1612   // global appears before the code section. Without the offset, a global at
1613   // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
1614   // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
1615   // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
1616   // instead of `0xf`.
1617   // This assumes that we're in the small code model so we can assume a binary
1618   // size of <= 4GB, which makes the untagged PC relative offset positive. The
1619   // binary must also be loaded into address range [0, 2^48). Both of these
1620   // properties need to be ensured at runtime when using tagged addresses.
1621   if (OpFlags & AArch64II::MO_TAGGED) {
1622     assert(!Offset &&
1623            "Should not have folded in an offset for a tagged global!");
1624     ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
1625                .addGlobalAddress(GV, 0x100000000,
1626                                  AArch64II::MO_PREL | AArch64II::MO_G3)
1627                .addImm(48);
1628     MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1629   }
1630 
1631   MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
1632       .addGlobalAddress(GV, Offset,
1633                         OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
1634   MI.eraseFromParent();
1635   return true;
1636 }
1637 
legalizeIntrinsic(LegalizerHelper & Helper,MachineInstr & MI) const1638 bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
1639                                              MachineInstr &MI) const {
1640   MachineIRBuilder &MIB = Helper.MIRBuilder;
1641   MachineRegisterInfo &MRI = *MIB.getMRI();
1642 
1643   auto LowerBinOp = [&MI, &MIB](unsigned Opcode) {
1644     MIB.buildInstr(Opcode, {MI.getOperand(0)},
1645                    {MI.getOperand(2), MI.getOperand(3)});
1646     MI.eraseFromParent();
1647     return true;
1648   };
1649 
1650   Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
1651   switch (IntrinsicID) {
1652   case Intrinsic::vacopy: {
1653     unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
1654     unsigned VaListSize =
1655       (ST->isTargetDarwin() || ST->isTargetWindows())
1656           ? PtrSize
1657           : ST->isTargetILP32() ? 20 : 32;
1658 
1659     MachineFunction &MF = *MI.getMF();
1660     auto Val = MF.getRegInfo().createGenericVirtualRegister(
1661         LLT::scalar(VaListSize * 8));
1662     MIB.buildLoad(Val, MI.getOperand(2),
1663                   *MF.getMachineMemOperand(MachinePointerInfo(),
1664                                            MachineMemOperand::MOLoad,
1665                                            VaListSize, Align(PtrSize)));
1666     MIB.buildStore(Val, MI.getOperand(1),
1667                    *MF.getMachineMemOperand(MachinePointerInfo(),
1668                                             MachineMemOperand::MOStore,
1669                                             VaListSize, Align(PtrSize)));
1670     MI.eraseFromParent();
1671     return true;
1672   }
1673   case Intrinsic::get_dynamic_area_offset: {
1674     MIB.buildConstant(MI.getOperand(0).getReg(), 0);
1675     MI.eraseFromParent();
1676     return true;
1677   }
1678   case Intrinsic::aarch64_mops_memset_tag: {
1679     assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1680     // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
1681     // the instruction).
1682     auto &Value = MI.getOperand(3);
1683     Register ExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
1684     Value.setReg(ExtValueReg);
1685     return true;
1686   }
1687   case Intrinsic::aarch64_prefetch: {
1688     auto &AddrVal = MI.getOperand(1);
1689 
1690     int64_t IsWrite = MI.getOperand(2).getImm();
1691     int64_t Target = MI.getOperand(3).getImm();
1692     int64_t IsStream = MI.getOperand(4).getImm();
1693     int64_t IsData = MI.getOperand(5).getImm();
1694 
1695     unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
1696                      (!IsData << 3) |    // IsDataCache bit
1697                      (Target << 1) |     // Cache level bits
1698                      (unsigned)IsStream; // Stream bit
1699 
1700     MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
1701     MI.eraseFromParent();
1702     return true;
1703   }
1704   case Intrinsic::aarch64_neon_uaddv:
1705   case Intrinsic::aarch64_neon_saddv:
1706   case Intrinsic::aarch64_neon_umaxv:
1707   case Intrinsic::aarch64_neon_smaxv:
1708   case Intrinsic::aarch64_neon_uminv:
1709   case Intrinsic::aarch64_neon_sminv: {
1710     bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv ||
1711                     IntrinsicID == Intrinsic::aarch64_neon_smaxv ||
1712                     IntrinsicID == Intrinsic::aarch64_neon_sminv;
1713 
1714     auto OldDst = MI.getOperand(0).getReg();
1715     auto OldDstTy = MRI.getType(OldDst);
1716     LLT NewDstTy = MRI.getType(MI.getOperand(2).getReg()).getElementType();
1717     if (OldDstTy == NewDstTy)
1718       return true;
1719 
1720     auto NewDst = MRI.createGenericVirtualRegister(NewDstTy);
1721 
1722     Helper.Observer.changingInstr(MI);
1723     MI.getOperand(0).setReg(NewDst);
1724     Helper.Observer.changedInstr(MI);
1725 
1726     MIB.setInsertPt(MIB.getMBB(), ++MIB.getInsertPt());
1727     MIB.buildExtOrTrunc(IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT,
1728                         OldDst, NewDst);
1729 
1730     return true;
1731   }
1732   case Intrinsic::aarch64_neon_uaddlp:
1733   case Intrinsic::aarch64_neon_saddlp: {
1734     unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
1735                        ? AArch64::G_UADDLP
1736                        : AArch64::G_SADDLP;
1737     MIB.buildInstr(Opc, {MI.getOperand(0)}, {MI.getOperand(2)});
1738     MI.eraseFromParent();
1739 
1740     return true;
1741   }
1742   case Intrinsic::aarch64_neon_uaddlv:
1743   case Intrinsic::aarch64_neon_saddlv: {
1744     unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
1745                        ? AArch64::G_UADDLV
1746                        : AArch64::G_SADDLV;
1747     Register DstReg = MI.getOperand(0).getReg();
1748     Register SrcReg = MI.getOperand(2).getReg();
1749     LLT DstTy = MRI.getType(DstReg);
1750 
1751     LLT MidTy, ExtTy;
1752     if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) {
1753       MidTy = LLT::fixed_vector(4, 32);
1754       ExtTy = LLT::scalar(32);
1755     } else {
1756       MidTy = LLT::fixed_vector(2, 64);
1757       ExtTy = LLT::scalar(64);
1758     }
1759 
1760     Register MidReg =
1761         MIB.buildInstr(Opc, {MidTy}, {SrcReg})->getOperand(0).getReg();
1762     Register ZeroReg =
1763         MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
1764     Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy},
1765                                      {MidReg, ZeroReg})
1766                           .getReg(0);
1767 
1768     if (DstTy.getScalarSizeInBits() < 32)
1769       MIB.buildTrunc(DstReg, ExtReg);
1770     else
1771       MIB.buildCopy(DstReg, ExtReg);
1772 
1773     MI.eraseFromParent();
1774 
1775     return true;
1776   }
1777   case Intrinsic::aarch64_neon_smax:
1778     return LowerBinOp(TargetOpcode::G_SMAX);
1779   case Intrinsic::aarch64_neon_smin:
1780     return LowerBinOp(TargetOpcode::G_SMIN);
1781   case Intrinsic::aarch64_neon_umax:
1782     return LowerBinOp(TargetOpcode::G_UMAX);
1783   case Intrinsic::aarch64_neon_umin:
1784     return LowerBinOp(TargetOpcode::G_UMIN);
1785   case Intrinsic::aarch64_neon_fmax:
1786     return LowerBinOp(TargetOpcode::G_FMAXIMUM);
1787   case Intrinsic::aarch64_neon_fmin:
1788     return LowerBinOp(TargetOpcode::G_FMINIMUM);
1789   case Intrinsic::aarch64_neon_fmaxnm:
1790     return LowerBinOp(TargetOpcode::G_FMAXNUM);
1791   case Intrinsic::aarch64_neon_fminnm:
1792     return LowerBinOp(TargetOpcode::G_FMINNUM);
1793   case Intrinsic::aarch64_neon_smull:
1794     return LowerBinOp(AArch64::G_SMULL);
1795   case Intrinsic::aarch64_neon_umull:
1796     return LowerBinOp(AArch64::G_UMULL);
1797   case Intrinsic::aarch64_neon_abs: {
1798     // Lower the intrinsic to G_ABS.
1799     MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});
1800     MI.eraseFromParent();
1801     return true;
1802   }
1803   case Intrinsic::aarch64_neon_sqadd: {
1804     if (MRI.getType(MI.getOperand(0).getReg()).isVector())
1805       return LowerBinOp(TargetOpcode::G_SADDSAT);
1806     break;
1807   }
1808   case Intrinsic::aarch64_neon_sqsub: {
1809     if (MRI.getType(MI.getOperand(0).getReg()).isVector())
1810       return LowerBinOp(TargetOpcode::G_SSUBSAT);
1811     break;
1812   }
1813   case Intrinsic::aarch64_neon_uqadd: {
1814     if (MRI.getType(MI.getOperand(0).getReg()).isVector())
1815       return LowerBinOp(TargetOpcode::G_UADDSAT);
1816     break;
1817   }
1818   case Intrinsic::aarch64_neon_uqsub: {
1819     if (MRI.getType(MI.getOperand(0).getReg()).isVector())
1820       return LowerBinOp(TargetOpcode::G_USUBSAT);
1821     break;
1822   }
1823 
1824   case Intrinsic::vector_reverse:
1825     // TODO: Add support for vector_reverse
1826     return false;
1827   }
1828 
1829   return true;
1830 }
1831 
legalizeShlAshrLshr(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & MIRBuilder,GISelChangeObserver & Observer) const1832 bool AArch64LegalizerInfo::legalizeShlAshrLshr(
1833     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1834     GISelChangeObserver &Observer) const {
1835   assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
1836          MI.getOpcode() == TargetOpcode::G_LSHR ||
1837          MI.getOpcode() == TargetOpcode::G_SHL);
1838   // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
1839   // imported patterns can select it later. Either way, it will be legal.
1840   Register AmtReg = MI.getOperand(2).getReg();
1841   auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
1842   if (!VRegAndVal)
1843     return true;
1844   // Check the shift amount is in range for an immediate form.
1845   int64_t Amount = VRegAndVal->Value.getSExtValue();
1846   if (Amount > 31)
1847     return true; // This will have to remain a register variant.
1848   auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
1849   Observer.changingInstr(MI);
1850   MI.getOperand(2).setReg(ExtCst.getReg(0));
1851   Observer.changedInstr(MI);
1852   return true;
1853 }
1854 
matchLDPSTPAddrMode(Register Root,Register & Base,int & Offset,MachineRegisterInfo & MRI)1855 static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
1856                                 MachineRegisterInfo &MRI) {
1857   Base = Root;
1858   Offset = 0;
1859 
1860   Register NewBase;
1861   int64_t NewOffset;
1862   if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
1863       isShiftedInt<7, 3>(NewOffset)) {
1864     Base = NewBase;
1865     Offset = NewOffset;
1866   }
1867 }
1868 
1869 // FIXME: This should be removed and replaced with the generic bitcast legalize
1870 // action.
legalizeLoadStore(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & MIRBuilder,GISelChangeObserver & Observer) const1871 bool AArch64LegalizerInfo::legalizeLoadStore(
1872     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1873     GISelChangeObserver &Observer) const {
1874   assert(MI.getOpcode() == TargetOpcode::G_STORE ||
1875          MI.getOpcode() == TargetOpcode::G_LOAD);
1876   // Here we just try to handle vector loads/stores where our value type might
1877   // have pointer elements, which the SelectionDAG importer can't handle. To
1878   // allow the existing patterns for s64 to fire for p0, we just try to bitcast
1879   // the value to use s64 types.
1880 
1881   // Custom legalization requires the instruction, if not deleted, must be fully
1882   // legalized. In order to allow further legalization of the inst, we create
1883   // a new instruction and erase the existing one.
1884 
1885   Register ValReg = MI.getOperand(0).getReg();
1886   const LLT ValTy = MRI.getType(ValReg);
1887 
1888   if (ValTy == LLT::scalar(128)) {
1889 
1890     AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
1891     bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
1892     bool IsLoadAcquire = IsLoad && Ordering == AtomicOrdering::Acquire;
1893     bool IsStoreRelease = !IsLoad && Ordering == AtomicOrdering::Release;
1894     bool IsRcpC3 =
1895         ST->hasLSE2() && ST->hasRCPC3() && (IsLoadAcquire || IsStoreRelease);
1896 
1897     LLT s64 = LLT::scalar(64);
1898 
1899     unsigned Opcode;
1900     if (IsRcpC3) {
1901       Opcode = IsLoad ? AArch64::LDIAPPX : AArch64::STILPX;
1902     } else {
1903       // For LSE2, loads/stores should have been converted to monotonic and had
1904       // a fence inserted after them.
1905       assert(Ordering == AtomicOrdering::Monotonic ||
1906              Ordering == AtomicOrdering::Unordered);
1907       assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
1908 
1909       Opcode = IsLoad ? AArch64::LDPXi : AArch64::STPXi;
1910     }
1911 
1912     MachineInstrBuilder NewI;
1913     if (IsLoad) {
1914       NewI = MIRBuilder.buildInstr(Opcode, {s64, s64}, {});
1915       MIRBuilder.buildMergeLikeInstr(
1916           ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
1917     } else {
1918       auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
1919       NewI = MIRBuilder.buildInstr(
1920           Opcode, {}, {Split->getOperand(0), Split->getOperand(1)});
1921     }
1922 
1923     if (IsRcpC3) {
1924       NewI.addUse(MI.getOperand(1).getReg());
1925     } else {
1926       Register Base;
1927       int Offset;
1928       matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
1929       NewI.addUse(Base);
1930       NewI.addImm(Offset / 8);
1931     }
1932 
1933     NewI.cloneMemRefs(MI);
1934     constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
1935                                      *MRI.getTargetRegisterInfo(),
1936                                      *ST->getRegBankInfo());
1937     MI.eraseFromParent();
1938     return true;
1939   }
1940 
1941   if (!ValTy.isPointerVector() ||
1942       ValTy.getElementType().getAddressSpace() != 0) {
1943     LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
1944     return false;
1945   }
1946 
1947   unsigned PtrSize = ValTy.getElementType().getSizeInBits();
1948   const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
1949   auto &MMO = **MI.memoperands_begin();
1950   MMO.setType(NewTy);
1951 
1952   if (MI.getOpcode() == TargetOpcode::G_STORE) {
1953     auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
1954     MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
1955   } else {
1956     auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
1957     MIRBuilder.buildBitcast(ValReg, NewLoad);
1958   }
1959   MI.eraseFromParent();
1960   return true;
1961 }
1962 
legalizeVaArg(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & MIRBuilder) const1963 bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
1964                                          MachineRegisterInfo &MRI,
1965                                          MachineIRBuilder &MIRBuilder) const {
1966   MachineFunction &MF = MIRBuilder.getMF();
1967   Align Alignment(MI.getOperand(2).getImm());
1968   Register Dst = MI.getOperand(0).getReg();
1969   Register ListPtr = MI.getOperand(1).getReg();
1970 
1971   LLT PtrTy = MRI.getType(ListPtr);
1972   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1973 
1974   const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
1975   const Align PtrAlign = Align(PtrSize);
1976   auto List = MIRBuilder.buildLoad(
1977       PtrTy, ListPtr,
1978       *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1979                                PtrTy, PtrAlign));
1980 
1981   MachineInstrBuilder DstPtr;
1982   if (Alignment > PtrAlign) {
1983     // Realign the list to the actual required alignment.
1984     auto AlignMinus1 =
1985         MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
1986     auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
1987     DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
1988   } else
1989     DstPtr = List;
1990 
1991   LLT ValTy = MRI.getType(Dst);
1992   uint64_t ValSize = ValTy.getSizeInBits() / 8;
1993   MIRBuilder.buildLoad(
1994       Dst, DstPtr,
1995       *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1996                                ValTy, std::max(Alignment, PtrAlign)));
1997 
1998   auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));
1999 
2000   auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));
2001 
2002   MIRBuilder.buildStore(NewList, ListPtr,
2003                         *MF.getMachineMemOperand(MachinePointerInfo(),
2004                                                  MachineMemOperand::MOStore,
2005                                                  PtrTy, PtrAlign));
2006 
2007   MI.eraseFromParent();
2008   return true;
2009 }
2010 
legalizeBitfieldExtract(MachineInstr & MI,MachineRegisterInfo & MRI,LegalizerHelper & Helper) const2011 bool AArch64LegalizerInfo::legalizeBitfieldExtract(
2012     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2013   // Only legal if we can select immediate forms.
2014   // TODO: Lower this otherwise.
2015   return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
2016          getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2017 }
2018 
legalizeCTPOP(MachineInstr & MI,MachineRegisterInfo & MRI,LegalizerHelper & Helper) const2019 bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
2020                                          MachineRegisterInfo &MRI,
2021                                          LegalizerHelper &Helper) const {
2022   // When there is no integer popcount instruction (FEAT_CSSC isn't available),
2023   // it can be more efficiently lowered to the following sequence that uses
2024   // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
2025   // registers are cheap.
2026   //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
2027   //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
2028   //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
2029   //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
2030   //
2031   // For 128 bit vector popcounts, we lower to the following sequence:
2032   //  cnt.16b   v0, v0  // v8s16, v4s32, v2s64
2033   //  uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
2034   //  uaddlp.4s v0, v0  //        v4s32, v2s64
2035   //  uaddlp.2d v0, v0  //               v2s64
2036   //
2037   // For 64 bit vector popcounts, we lower to the following sequence:
2038   //  cnt.8b    v0, v0  // v4s16, v2s32
2039   //  uaddlp.4h v0, v0  // v4s16, v2s32
2040   //  uaddlp.2s v0, v0  //        v2s32
2041 
2042   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2043   Register Dst = MI.getOperand(0).getReg();
2044   Register Val = MI.getOperand(1).getReg();
2045   LLT Ty = MRI.getType(Val);
2046   unsigned Size = Ty.getSizeInBits();
2047 
2048   assert(Ty == MRI.getType(Dst) &&
2049          "Expected src and dst to have the same type!");
2050 
2051   if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
2052     LLT s64 = LLT::scalar(64);
2053 
2054     auto Split = MIRBuilder.buildUnmerge(s64, Val);
2055     auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0));
2056     auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1));
2057     auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2);
2058 
2059     MIRBuilder.buildZExt(Dst, Add);
2060     MI.eraseFromParent();
2061     return true;
2062   }
2063 
2064   if (!ST->hasNEON() ||
2065       MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) {
2066     // Use generic lowering when custom lowering is not possible.
2067     return Ty.isScalar() && (Size == 32 || Size == 64) &&
2068            Helper.lowerBitCount(MI) ==
2069                LegalizerHelper::LegalizeResult::Legalized;
2070   }
2071 
2072   // Pre-conditioning: widen Val up to the nearest vector type.
2073   // s32,s64,v4s16,v2s32 -> v8i8
2074   // v8s16,v4s32,v2s64 -> v16i8
2075   LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
2076   if (Ty.isScalar()) {
2077     assert((Size == 32 || Size == 64 || Size == 128) && "Expected only 32, 64, or 128 bit scalars!");
2078     if (Size == 32) {
2079       Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
2080     }
2081   }
2082   Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);
2083 
2084   // Count bits in each byte-sized lane.
2085   auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);
2086 
2087   // Sum across lanes.
2088 
2089   if (ST->hasDotProd() && Ty.isVector() && Ty.getNumElements() >= 2 &&
2090       Ty.getScalarSizeInBits() != 16) {
2091     LLT Dt = Ty == LLT::fixed_vector(2, 64) ? LLT::fixed_vector(4, 32) : Ty;
2092     auto Zeros = MIRBuilder.buildConstant(Dt, 0);
2093     auto Ones = MIRBuilder.buildConstant(VTy, 1);
2094     MachineInstrBuilder Sum;
2095 
2096     if (Ty == LLT::fixed_vector(2, 64)) {
2097       auto UDOT =
2098           MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
2099       Sum = MIRBuilder.buildInstr(AArch64::G_UADDLP, {Ty}, {UDOT});
2100     } else if (Ty == LLT::fixed_vector(4, 32)) {
2101       Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
2102     } else if (Ty == LLT::fixed_vector(2, 32)) {
2103       Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
2104     } else {
2105       llvm_unreachable("unexpected vector shape");
2106     }
2107 
2108     Sum->getOperand(0).setReg(Dst);
2109     MI.eraseFromParent();
2110     return true;
2111   }
2112 
2113   Register HSum = CTPOP.getReg(0);
2114   unsigned Opc;
2115   SmallVector<LLT> HAddTys;
2116   if (Ty.isScalar()) {
2117     Opc = Intrinsic::aarch64_neon_uaddlv;
2118     HAddTys.push_back(LLT::scalar(32));
2119   } else if (Ty == LLT::fixed_vector(8, 16)) {
2120     Opc = Intrinsic::aarch64_neon_uaddlp;
2121     HAddTys.push_back(LLT::fixed_vector(8, 16));
2122   } else if (Ty == LLT::fixed_vector(4, 32)) {
2123     Opc = Intrinsic::aarch64_neon_uaddlp;
2124     HAddTys.push_back(LLT::fixed_vector(8, 16));
2125     HAddTys.push_back(LLT::fixed_vector(4, 32));
2126   } else if (Ty == LLT::fixed_vector(2, 64)) {
2127     Opc = Intrinsic::aarch64_neon_uaddlp;
2128     HAddTys.push_back(LLT::fixed_vector(8, 16));
2129     HAddTys.push_back(LLT::fixed_vector(4, 32));
2130     HAddTys.push_back(LLT::fixed_vector(2, 64));
2131   } else if (Ty == LLT::fixed_vector(4, 16)) {
2132     Opc = Intrinsic::aarch64_neon_uaddlp;
2133     HAddTys.push_back(LLT::fixed_vector(4, 16));
2134   } else if (Ty == LLT::fixed_vector(2, 32)) {
2135     Opc = Intrinsic::aarch64_neon_uaddlp;
2136     HAddTys.push_back(LLT::fixed_vector(4, 16));
2137     HAddTys.push_back(LLT::fixed_vector(2, 32));
2138   } else
2139     llvm_unreachable("unexpected vector shape");
2140   MachineInstrBuilder UADD;
2141   for (LLT HTy : HAddTys) {
2142     UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}).addUse(HSum);
2143     HSum = UADD.getReg(0);
2144   }
2145 
2146   // Post-conditioning.
2147   if (Ty.isScalar() && (Size == 64 || Size == 128))
2148     MIRBuilder.buildZExt(Dst, UADD);
2149   else
2150     UADD->getOperand(0).setReg(Dst);
2151   MI.eraseFromParent();
2152   return true;
2153 }
2154 
legalizeAtomicCmpxchg128(MachineInstr & MI,MachineRegisterInfo & MRI,LegalizerHelper & Helper) const2155 bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
2156     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2157   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2158   LLT s64 = LLT::scalar(64);
2159   auto Addr = MI.getOperand(1).getReg();
2160   auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
2161   auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
2162   auto DstLo = MRI.createGenericVirtualRegister(s64);
2163   auto DstHi = MRI.createGenericVirtualRegister(s64);
2164 
2165   MachineInstrBuilder CAS;
2166   if (ST->hasLSE()) {
2167     // We have 128-bit CASP instructions taking XSeqPair registers, which are
2168     // s128. We need the merge/unmerge to bracket the expansion and pair up with
2169     // the rest of the MIR so we must reassemble the extracted registers into a
2170     // 128-bit known-regclass one with code like this:
2171     //
2172     //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
2173     //     %out = CASP %in1, ...
2174     //     %OldLo = G_EXTRACT %out, 0
2175     //     %OldHi = G_EXTRACT %out, 64
2176     auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
2177     unsigned Opcode;
2178     switch (Ordering) {
2179     case AtomicOrdering::Acquire:
2180       Opcode = AArch64::CASPAX;
2181       break;
2182     case AtomicOrdering::Release:
2183       Opcode = AArch64::CASPLX;
2184       break;
2185     case AtomicOrdering::AcquireRelease:
2186     case AtomicOrdering::SequentiallyConsistent:
2187       Opcode = AArch64::CASPALX;
2188       break;
2189     default:
2190       Opcode = AArch64::CASPX;
2191       break;
2192     }
2193 
2194     LLT s128 = LLT::scalar(128);
2195     auto CASDst = MRI.createGenericVirtualRegister(s128);
2196     auto CASDesired = MRI.createGenericVirtualRegister(s128);
2197     auto CASNew = MRI.createGenericVirtualRegister(s128);
2198     MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
2199         .addUse(DesiredI->getOperand(0).getReg())
2200         .addImm(AArch64::sube64)
2201         .addUse(DesiredI->getOperand(1).getReg())
2202         .addImm(AArch64::subo64);
2203     MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
2204         .addUse(NewI->getOperand(0).getReg())
2205         .addImm(AArch64::sube64)
2206         .addUse(NewI->getOperand(1).getReg())
2207         .addImm(AArch64::subo64);
2208 
2209     CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});
2210 
2211     MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
2212     MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
2213   } else {
2214     // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
2215     // can take arbitrary registers so it just has the normal GPR64 operands the
2216     // rest of AArch64 is expecting.
2217     auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
2218     unsigned Opcode;
2219     switch (Ordering) {
2220     case AtomicOrdering::Acquire:
2221       Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
2222       break;
2223     case AtomicOrdering::Release:
2224       Opcode = AArch64::CMP_SWAP_128_RELEASE;
2225       break;
2226     case AtomicOrdering::AcquireRelease:
2227     case AtomicOrdering::SequentiallyConsistent:
2228       Opcode = AArch64::CMP_SWAP_128;
2229       break;
2230     default:
2231       Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
2232       break;
2233     }
2234 
2235     auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2236     CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
2237                                 {Addr, DesiredI->getOperand(0),
2238                                  DesiredI->getOperand(1), NewI->getOperand(0),
2239                                  NewI->getOperand(1)});
2240   }
2241 
2242   CAS.cloneMemRefs(MI);
2243   constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
2244                                    *MRI.getTargetRegisterInfo(),
2245                                    *ST->getRegBankInfo());
2246 
2247   MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi});
2248   MI.eraseFromParent();
2249   return true;
2250 }
2251 
legalizeCTTZ(MachineInstr & MI,LegalizerHelper & Helper) const2252 bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
2253                                         LegalizerHelper &Helper) const {
2254   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2255   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
2256   LLT Ty = MRI.getType(MI.getOperand(1).getReg());
2257   auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
2258   MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
2259   MI.eraseFromParent();
2260   return true;
2261 }
2262 
legalizeMemOps(MachineInstr & MI,LegalizerHelper & Helper) const2263 bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
2264                                           LegalizerHelper &Helper) const {
2265   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2266 
2267   // Tagged version MOPSMemorySetTagged is legalised in legalizeIntrinsic
2268   if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
2269     // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
2270     // the instruction).
2271     auto &Value = MI.getOperand(1);
2272     Register ExtValueReg =
2273         MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
2274     Value.setReg(ExtValueReg);
2275     return true;
2276   }
2277 
2278   return false;
2279 }
2280 
legalizeExtractVectorElt(MachineInstr & MI,MachineRegisterInfo & MRI,LegalizerHelper & Helper) const2281 bool AArch64LegalizerInfo::legalizeExtractVectorElt(
2282     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2283   const GExtractVectorElement *Element = cast<GExtractVectorElement>(&MI);
2284   auto VRegAndVal =
2285       getIConstantVRegValWithLookThrough(Element->getIndexReg(), MRI);
2286   if (VRegAndVal)
2287     return true;
2288   LLT VecTy = MRI.getType(Element->getVectorReg());
2289   if (VecTy.isScalableVector())
2290     return true;
2291   return Helper.lowerExtractInsertVectorElt(MI) !=
2292          LegalizerHelper::LegalizeResult::UnableToLegalize;
2293 }
2294 
legalizeDynStackAlloc(MachineInstr & MI,LegalizerHelper & Helper) const2295 bool AArch64LegalizerInfo::legalizeDynStackAlloc(
2296     MachineInstr &MI, LegalizerHelper &Helper) const {
2297   MachineFunction &MF = *MI.getParent()->getParent();
2298   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2299   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
2300 
2301   // If stack probing is not enabled for this function, use the default
2302   // lowering.
2303   if (!MF.getFunction().hasFnAttribute("probe-stack") ||
2304       MF.getFunction().getFnAttribute("probe-stack").getValueAsString() !=
2305           "inline-asm") {
2306     Helper.lowerDynStackAlloc(MI);
2307     return true;
2308   }
2309 
2310   Register Dst = MI.getOperand(0).getReg();
2311   Register AllocSize = MI.getOperand(1).getReg();
2312   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
2313 
2314   assert(MRI.getType(Dst) == LLT::pointer(0, 64) &&
2315          "Unexpected type for dynamic alloca");
2316   assert(MRI.getType(AllocSize) == LLT::scalar(64) &&
2317          "Unexpected type for dynamic alloca");
2318 
2319   LLT PtrTy = MRI.getType(Dst);
2320   Register SPReg =
2321       Helper.getTargetLowering().getStackPointerRegisterToSaveRestore();
2322   Register SPTmp =
2323       Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
2324   auto NewMI =
2325       MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp});
2326   MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass);
2327   MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI);
2328   MIRBuilder.buildCopy(Dst, SPTmp);
2329 
2330   MI.eraseFromParent();
2331   return true;
2332 }
2333 
legalizePrefetch(MachineInstr & MI,LegalizerHelper & Helper) const2334 bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
2335                                             LegalizerHelper &Helper) const {
2336   MachineIRBuilder &MIB = Helper.MIRBuilder;
2337   auto &AddrVal = MI.getOperand(0);
2338 
2339   int64_t IsWrite = MI.getOperand(1).getImm();
2340   int64_t Locality = MI.getOperand(2).getImm();
2341   int64_t IsData = MI.getOperand(3).getImm();
2342 
2343   bool IsStream = Locality == 0;
2344   if (Locality != 0) {
2345     assert(Locality <= 3 && "Prefetch locality out-of-range");
2346     // The locality degree is the opposite of the cache speed.
2347     // Put the number the other way around.
2348     // The encoding starts at 0 for level 1
2349     Locality = 3 - Locality;
2350   }
2351 
2352   unsigned PrfOp = (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;
2353 
2354   MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
2355   MI.eraseFromParent();
2356   return true;
2357 }
2358