1 //===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AArch64LegalizerInfo.h"
15 #include "AArch64Subtarget.h"
16 #include "llvm/ADT/STLExtras.h"
17 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
19 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
20 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
21 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
22 #include "llvm/CodeGen/GlobalISel/Utils.h"
23 #include "llvm/CodeGen/MachineInstr.h"
24 #include "llvm/CodeGen/MachineRegisterInfo.h"
25 #include "llvm/CodeGen/TargetOpcodes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/Intrinsics.h"
28 #include "llvm/IR/IntrinsicsAArch64.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/MathExtras.h"
31 #include <initializer_list>
32
33 #define DEBUG_TYPE "aarch64-legalinfo"
34
35 using namespace llvm;
36 using namespace LegalizeActions;
37 using namespace LegalizeMutations;
38 using namespace LegalityPredicates;
39 using namespace MIPatternMatch;
40
AArch64LegalizerInfo(const AArch64Subtarget & ST)41 AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
42 : ST(&ST) {
43 using namespace TargetOpcode;
44 const LLT p0 = LLT::pointer(0, 64);
45 const LLT s8 = LLT::scalar(8);
46 const LLT s16 = LLT::scalar(16);
47 const LLT s32 = LLT::scalar(32);
48 const LLT s64 = LLT::scalar(64);
49 const LLT s128 = LLT::scalar(128);
50 const LLT v16s8 = LLT::fixed_vector(16, 8);
51 const LLT v8s8 = LLT::fixed_vector(8, 8);
52 const LLT v4s8 = LLT::fixed_vector(4, 8);
53 const LLT v2s8 = LLT::fixed_vector(2, 8);
54 const LLT v8s16 = LLT::fixed_vector(8, 16);
55 const LLT v4s16 = LLT::fixed_vector(4, 16);
56 const LLT v2s16 = LLT::fixed_vector(2, 16);
57 const LLT v2s32 = LLT::fixed_vector(2, 32);
58 const LLT v4s32 = LLT::fixed_vector(4, 32);
59 const LLT v2s64 = LLT::fixed_vector(2, 64);
60 const LLT v2p0 = LLT::fixed_vector(2, p0);
61
62 const LLT nxv16s8 = LLT::scalable_vector(16, s8);
63 const LLT nxv8s16 = LLT::scalable_vector(8, s16);
64 const LLT nxv4s32 = LLT::scalable_vector(4, s32);
65 const LLT nxv2s64 = LLT::scalable_vector(2, s64);
66
67 std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
68 v16s8, v8s16, v4s32,
69 v2s64, v2p0,
70 /* End 128bit types */
71 /* Begin 64bit types */
72 v8s8, v4s16, v2s32};
73 std::initializer_list<LLT> ScalarAndPtrTypesList = {s8, s16, s32, s64, p0};
74 SmallVector<LLT, 8> PackedVectorAllTypesVec(PackedVectorAllTypeList);
75 SmallVector<LLT, 8> ScalarAndPtrTypesVec(ScalarAndPtrTypesList);
76
77 const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
78
79 // FIXME: support subtargets which have neon/fp-armv8 disabled.
80 if (!ST.hasNEON() || !ST.hasFPARMv8()) {
81 getLegacyLegalizerInfo().computeTables();
82 return;
83 }
84
85 // Some instructions only support s16 if the subtarget has full 16-bit FP
86 // support.
87 const bool HasFP16 = ST.hasFullFP16();
88 const LLT &MinFPScalar = HasFP16 ? s16 : s32;
89
90 const bool HasCSSC = ST.hasCSSC();
91 const bool HasRCPC3 = ST.hasRCPC3();
92 const bool HasSVE = ST.hasSVE();
93
94 getActionDefinitionsBuilder(
95 {G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER})
96 .legalFor({p0, s8, s16, s32, s64})
97 .legalFor({v2s8, v4s8, v8s8, v16s8, v2s16, v4s16, v8s16, v2s32, v4s32,
98 v2s64, v2p0})
99 .widenScalarToNextPow2(0)
100 .clampScalar(0, s8, s64)
101 .moreElementsToNextPow2(0)
102 .widenVectorEltsToVectorMinSize(0, 64)
103 .clampNumElements(0, v8s8, v16s8)
104 .clampNumElements(0, v4s16, v8s16)
105 .clampNumElements(0, v2s32, v4s32)
106 .clampMaxNumElements(0, s64, 2)
107 .clampMaxNumElements(0, p0, 2)
108 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
109
110 getActionDefinitionsBuilder(G_PHI)
111 .legalFor({p0, s16, s32, s64})
112 .legalFor(PackedVectorAllTypeList)
113 .widenScalarToNextPow2(0)
114 .moreElementsToNextPow2(0)
115 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
116 .clampScalar(0, s16, s64)
117 .clampNumElements(0, v8s8, v16s8)
118 .clampNumElements(0, v4s16, v8s16)
119 .clampNumElements(0, v2s32, v4s32)
120 .clampMaxNumElements(0, s64, 2)
121 .clampMaxNumElements(0, p0, 2);
122
123 getActionDefinitionsBuilder(G_INSERT)
124 .legalIf(all(typeInSet(0, {s32, s64, p0}), typeInSet(1, {s8, s16, s32}),
125 smallerThan(1, 0)))
126 .widenScalarToNextPow2(0)
127 .clampScalar(0, s32, s64)
128 .widenScalarToNextPow2(1)
129 .minScalar(1, s8)
130 .maxScalarIf(typeInSet(0, {s32}), 1, s16)
131 .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);
132
133 getActionDefinitionsBuilder(G_EXTRACT)
134 .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
135 typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
136 .widenScalarToNextPow2(1)
137 .clampScalar(1, s32, s128)
138 .widenScalarToNextPow2(0)
139 .minScalar(0, s16)
140 .maxScalarIf(typeInSet(1, {s32}), 0, s16)
141 .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
142 .maxScalarIf(typeInSet(1, {s128}), 0, s64);
143
144 getActionDefinitionsBuilder({G_ADD, G_SUB, G_AND, G_OR, G_XOR})
145 .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64})
146 .legalFor(HasSVE, {nxv16s8, nxv8s16, nxv4s32, nxv2s64})
147 .widenScalarToNextPow2(0)
148 .clampScalar(0, s32, s64)
149 .clampMaxNumElements(0, s8, 16)
150 .clampMaxNumElements(0, s16, 8)
151 .clampNumElements(0, v2s32, v4s32)
152 .clampNumElements(0, v2s64, v2s64)
153 .minScalarOrEltIf(
154 [=](const LegalityQuery &Query) {
155 return Query.Types[0].getNumElements() <= 2;
156 },
157 0, s32)
158 .minScalarOrEltIf(
159 [=](const LegalityQuery &Query) {
160 return Query.Types[0].getNumElements() <= 4;
161 },
162 0, s16)
163 .minScalarOrEltIf(
164 [=](const LegalityQuery &Query) {
165 return Query.Types[0].getNumElements() <= 16;
166 },
167 0, s8)
168 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
169 .moreElementsToNextPow2(0);
170
171 getActionDefinitionsBuilder(G_MUL)
172 .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64})
173 .widenScalarToNextPow2(0)
174 .clampScalar(0, s32, s64)
175 .clampMaxNumElements(0, s8, 16)
176 .clampMaxNumElements(0, s16, 8)
177 .clampNumElements(0, v2s32, v4s32)
178 .clampNumElements(0, v2s64, v2s64)
179 .minScalarOrEltIf(
180 [=](const LegalityQuery &Query) {
181 return Query.Types[0].getNumElements() <= 2;
182 },
183 0, s32)
184 .minScalarOrEltIf(
185 [=](const LegalityQuery &Query) {
186 return Query.Types[0].getNumElements() <= 4;
187 },
188 0, s16)
189 .minScalarOrEltIf(
190 [=](const LegalityQuery &Query) {
191 return Query.Types[0].getNumElements() <= 16;
192 },
193 0, s8)
194 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
195 .moreElementsToNextPow2(0);
196
197 getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
198 .customIf([=](const LegalityQuery &Query) {
199 const auto &SrcTy = Query.Types[0];
200 const auto &AmtTy = Query.Types[1];
201 return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
202 AmtTy.getSizeInBits() == 32;
203 })
204 .legalFor({
205 {s32, s32},
206 {s32, s64},
207 {s64, s64},
208 {v8s8, v8s8},
209 {v16s8, v16s8},
210 {v4s16, v4s16},
211 {v8s16, v8s16},
212 {v2s32, v2s32},
213 {v4s32, v4s32},
214 {v2s64, v2s64},
215 })
216 .widenScalarToNextPow2(0)
217 .clampScalar(1, s32, s64)
218 .clampScalar(0, s32, s64)
219 .clampNumElements(0, v8s8, v16s8)
220 .clampNumElements(0, v4s16, v8s16)
221 .clampNumElements(0, v2s32, v4s32)
222 .clampNumElements(0, v2s64, v2s64)
223 .moreElementsToNextPow2(0)
224 .minScalarSameAs(1, 0)
225 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
226
227 getActionDefinitionsBuilder(G_PTR_ADD)
228 .legalFor({{p0, s64}, {v2p0, v2s64}})
229 .clampScalarOrElt(1, s64, s64)
230 .clampNumElements(0, v2p0, v2p0);
231
232 getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
233
234 getActionDefinitionsBuilder({G_SDIV, G_UDIV})
235 .legalFor({s32, s64})
236 .libcallFor({s128})
237 .clampScalar(0, s32, s64)
238 .widenScalarToNextPow2(0)
239 .scalarize(0);
240
241 getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
242 .lowerFor({s8, s16, s32, s64, v2s32, v4s32, v2s64})
243 .libcallFor({s128})
244 .widenScalarOrEltToNextPow2(0)
245 .minScalarOrElt(0, s32)
246 .clampNumElements(0, v2s32, v4s32)
247 .clampNumElements(0, v2s64, v2s64)
248 .scalarize(0);
249
250 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
251 .widenScalarToNextPow2(0, /*Min = */ 32)
252 .clampScalar(0, s32, s64)
253 .lower();
254
255 getActionDefinitionsBuilder({G_SMULH, G_UMULH})
256 .legalFor({s64, v16s8, v8s16, v4s32})
257 .lower();
258
259 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
260 .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
261 .legalFor(HasCSSC, {s32, s64})
262 .minScalar(HasCSSC, 0, s32)
263 .clampNumElements(0, v8s8, v16s8)
264 .clampNumElements(0, v4s16, v8s16)
265 .clampNumElements(0, v2s32, v4s32)
266 .lower();
267
268 // FIXME: Legal vector types are only legal with NEON.
269 getActionDefinitionsBuilder(G_ABS)
270 .legalFor(HasCSSC, {s32, s64})
271 .legalFor(PackedVectorAllTypeList)
272 .customIf([=](const LegalityQuery &Q) {
273 // TODO: Fix suboptimal codegen for 128+ bit types.
274 LLT SrcTy = Q.Types[0];
275 return SrcTy.isScalar() && SrcTy.getSizeInBits() < 128;
276 })
277 .widenScalarIf(
278 [=](const LegalityQuery &Query) { return Query.Types[0] == v4s8; },
279 [=](const LegalityQuery &Query) { return std::make_pair(0, v4s16); })
280 .widenScalarIf(
281 [=](const LegalityQuery &Query) { return Query.Types[0] == v2s16; },
282 [=](const LegalityQuery &Query) { return std::make_pair(0, v2s32); })
283 .clampNumElements(0, v8s8, v16s8)
284 .clampNumElements(0, v4s16, v8s16)
285 .clampNumElements(0, v2s32, v4s32)
286 .clampNumElements(0, v2s64, v2s64)
287 .moreElementsToNextPow2(0)
288 .lower();
289
290 getActionDefinitionsBuilder(
291 {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
292 .legalFor({{s32, s32}, {s64, s32}})
293 .clampScalar(0, s32, s64)
294 .clampScalar(1, s32, s64)
295 .widenScalarToNextPow2(0);
296
297 getActionDefinitionsBuilder({G_FSHL, G_FSHR})
298 .customFor({{s32, s32}, {s32, s64}, {s64, s64}})
299 .lower();
300
301 getActionDefinitionsBuilder(G_ROTR)
302 .legalFor({{s32, s64}, {s64, s64}})
303 .customIf([=](const LegalityQuery &Q) {
304 return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
305 })
306 .lower();
307 getActionDefinitionsBuilder(G_ROTL).lower();
308
309 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
310 .customFor({{s32, s32}, {s64, s64}});
311
312 auto always = [=](const LegalityQuery &Q) { return true; };
313 getActionDefinitionsBuilder(G_CTPOP)
314 .legalFor(HasCSSC, {{s32, s32}, {s64, s64}})
315 .legalFor({{v8s8, v8s8}, {v16s8, v16s8}})
316 .customFor(!HasCSSC, {{s32, s32}, {s64, s64}})
317 .customFor({{s128, s128},
318 {v4s16, v4s16},
319 {v8s16, v8s16},
320 {v2s32, v2s32},
321 {v4s32, v4s32},
322 {v2s64, v2s64}})
323 .clampScalar(0, s32, s128)
324 .widenScalarToNextPow2(0)
325 .minScalarEltSameAsIf(always, 1, 0)
326 .maxScalarEltSameAsIf(always, 1, 0)
327 .clampNumElements(0, v8s8, v16s8)
328 .clampNumElements(0, v4s16, v8s16)
329 .clampNumElements(0, v2s32, v4s32)
330 .clampNumElements(0, v2s64, v2s64)
331 .moreElementsToNextPow2(0)
332 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
333
334 getActionDefinitionsBuilder(G_CTLZ)
335 .legalFor({{s32, s32},
336 {s64, s64},
337 {v8s8, v8s8},
338 {v16s8, v16s8},
339 {v4s16, v4s16},
340 {v8s16, v8s16},
341 {v2s32, v2s32},
342 {v4s32, v4s32}})
343 .widenScalarToNextPow2(1, /*Min=*/32)
344 .clampScalar(1, s32, s64)
345 .clampNumElements(0, v8s8, v16s8)
346 .clampNumElements(0, v4s16, v8s16)
347 .clampNumElements(0, v2s32, v4s32)
348 .moreElementsToNextPow2(0)
349 .scalarizeIf(scalarOrEltWiderThan(0, 32), 0)
350 .scalarSameSizeAs(0, 1);
351
352 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();
353
354 getActionDefinitionsBuilder(G_CTTZ)
355 .lowerIf(isVector(0))
356 .widenScalarToNextPow2(1, /*Min=*/32)
357 .clampScalar(1, s32, s64)
358 .scalarSameSizeAs(0, 1)
359 .legalFor(HasCSSC, {s32, s64})
360 .customFor(!HasCSSC, {s32, s64});
361
362 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();
363
364 getActionDefinitionsBuilder(G_BITREVERSE)
365 .legalFor({s32, s64, v8s8, v16s8})
366 .widenScalarToNextPow2(0, /*Min = */ 32)
367 .widenScalarOrEltToNextPow2OrMinSize(0, 8)
368 .clampScalar(0, s32, s64)
369 .clampNumElements(0, v8s8, v16s8)
370 .clampNumElements(0, v4s16, v8s16)
371 .clampNumElements(0, v2s32, v4s32)
372 .clampNumElements(0, v2s64, v2s64)
373 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
374 .moreElementsToNextPow2(0)
375 .lower();
376
377 getActionDefinitionsBuilder(G_BSWAP)
378 .legalFor({s32, s64, v4s16, v8s16, v2s32, v4s32, v2s64})
379 .widenScalarOrEltToNextPow2(0, 16)
380 .clampScalar(0, s32, s64)
381 .clampNumElements(0, v4s16, v8s16)
382 .clampNumElements(0, v2s32, v4s32)
383 .clampNumElements(0, v2s64, v2s64)
384 .moreElementsToNextPow2(0);
385
386 getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
387 .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64})
388 .legalFor(HasSVE, {nxv16s8, nxv8s16, nxv4s32, nxv2s64})
389 .clampNumElements(0, v8s8, v16s8)
390 .clampNumElements(0, v4s16, v8s16)
391 .clampNumElements(0, v2s32, v4s32)
392 .clampMaxNumElements(0, s64, 2)
393 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
394 .moreElementsToNextPow2(0)
395 .lower();
396
397 getActionDefinitionsBuilder(
398 {G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FSQRT, G_FMAXNUM, G_FMINNUM,
399 G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR, G_FRINT, G_FNEARBYINT,
400 G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
401 .legalFor({s32, s64, v2s32, v4s32, v2s64})
402 .legalFor(HasFP16, {s16, v4s16, v8s16})
403 .libcallFor({s128})
404 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
405 .minScalarOrElt(0, MinFPScalar)
406 .clampNumElements(0, v4s16, v8s16)
407 .clampNumElements(0, v2s32, v4s32)
408 .clampNumElements(0, v2s64, v2s64)
409 .moreElementsToNextPow2(0);
410
411 getActionDefinitionsBuilder({G_FABS, G_FNEG})
412 .legalFor({s32, s64, v2s32, v4s32, v2s64})
413 .legalFor(HasFP16, {s16, v4s16, v8s16})
414 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
415 .lowerIf(scalarOrEltWiderThan(0, 64))
416 .clampNumElements(0, v4s16, v8s16)
417 .clampNumElements(0, v2s32, v4s32)
418 .clampNumElements(0, v2s64, v2s64)
419 .moreElementsToNextPow2(0)
420 .lowerFor({s16, v4s16, v8s16});
421
422 getActionDefinitionsBuilder(G_FREM)
423 .libcallFor({s32, s64, s128})
424 .minScalar(0, s32)
425 .scalarize(0);
426
427 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
428 .legalFor({{s64, MinFPScalar}, {s64, s32}, {s64, s64}})
429 .libcallFor({{s64, s128}})
430 .minScalarOrElt(1, MinFPScalar);
431
432 getActionDefinitionsBuilder({G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2,
433 G_FLOG10, G_FTAN, G_FEXP, G_FEXP2, G_FEXP10,
434 G_FACOS, G_FASIN, G_FATAN, G_FATAN2, G_FCOSH,
435 G_FSINH, G_FTANH})
436 // We need a call for these, so we always need to scalarize.
437 .scalarize(0)
438 // Regardless of FP16 support, widen 16-bit elements to 32-bits.
439 .minScalar(0, s32)
440 .libcallFor({s32, s64, s128});
441 getActionDefinitionsBuilder(G_FPOWI)
442 .scalarize(0)
443 .minScalar(0, s32)
444 .libcallFor({{s32, s32}, {s64, s32}, {s128, s32}});
445
446 // TODO: Libcall support for s128.
447 // TODO: s16 should be legal with full FP16 support.
448 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
449 .legalFor({{s64, s32}, {s64, s64}});
450
451 // TODO: Custom legalization for mismatched types.
452 getActionDefinitionsBuilder(G_FCOPYSIGN)
453 .moreElementsIf(
454 [](const LegalityQuery &Query) { return Query.Types[0].isScalar(); },
455 [=](const LegalityQuery &Query) {
456 const LLT Ty = Query.Types[0];
457 return std::pair(0, LLT::fixed_vector(Ty == s16 ? 4 : 2, Ty));
458 })
459 .lower();
460
461 getActionDefinitionsBuilder(G_FMAD).lower();
462
463 for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
464 auto &Actions = getActionDefinitionsBuilder(Op);
465
466 if (Op == G_SEXTLOAD)
467 Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));
468
469 // Atomics have zero extending behavior.
470 Actions
471 .legalForTypesWithMemDesc({{s32, p0, s8, 8},
472 {s32, p0, s16, 8},
473 {s32, p0, s32, 8},
474 {s64, p0, s8, 2},
475 {s64, p0, s16, 2},
476 {s64, p0, s32, 4},
477 {s64, p0, s64, 8},
478 {p0, p0, s64, 8},
479 {v2s32, p0, s64, 8}})
480 .widenScalarToNextPow2(0)
481 .clampScalar(0, s32, s64)
482 // TODO: We could support sum-of-pow2's but the lowering code doesn't know
483 // how to do that yet.
484 .unsupportedIfMemSizeNotPow2()
485 // Lower anything left over into G_*EXT and G_LOAD
486 .lower();
487 }
488
489 auto IsPtrVecPred = [=](const LegalityQuery &Query) {
490 const LLT &ValTy = Query.Types[0];
491 return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0;
492 };
493
494 getActionDefinitionsBuilder(G_LOAD)
495 .customIf([=](const LegalityQuery &Query) {
496 return HasRCPC3 && Query.Types[0] == s128 &&
497 Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
498 })
499 .customIf([=](const LegalityQuery &Query) {
500 return Query.Types[0] == s128 &&
501 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
502 })
503 .legalForTypesWithMemDesc({{s8, p0, s8, 8},
504 {s16, p0, s16, 8},
505 {s32, p0, s32, 8},
506 {s64, p0, s64, 8},
507 {p0, p0, s64, 8},
508 {s128, p0, s128, 8},
509 {v8s8, p0, s64, 8},
510 {v16s8, p0, s128, 8},
511 {v4s16, p0, s64, 8},
512 {v8s16, p0, s128, 8},
513 {v2s32, p0, s64, 8},
514 {v4s32, p0, s128, 8},
515 {v2s64, p0, s128, 8}})
516 // These extends are also legal
517 .legalForTypesWithMemDesc(
518 {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
519 .legalForTypesWithMemDesc({
520 // SVE vscale x 128 bit base sizes
521 {nxv16s8, p0, nxv16s8, 8},
522 {nxv8s16, p0, nxv8s16, 8},
523 {nxv4s32, p0, nxv4s32, 8},
524 {nxv2s64, p0, nxv2s64, 8},
525 })
526 .widenScalarToNextPow2(0, /* MinSize = */ 8)
527 .clampMaxNumElements(0, s8, 16)
528 .clampMaxNumElements(0, s16, 8)
529 .clampMaxNumElements(0, s32, 4)
530 .clampMaxNumElements(0, s64, 2)
531 .clampMaxNumElements(0, p0, 2)
532 .lowerIfMemSizeNotByteSizePow2()
533 .clampScalar(0, s8, s64)
534 .narrowScalarIf(
535 [=](const LegalityQuery &Query) {
536 // Clamp extending load results to 32-bits.
537 return Query.Types[0].isScalar() &&
538 Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
539 Query.Types[0].getSizeInBits() > 32;
540 },
541 changeTo(0, s32))
542 // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
543 .bitcastIf(typeInSet(0, {v4s8}),
544 [=](const LegalityQuery &Query) {
545 const LLT VecTy = Query.Types[0];
546 return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
547 })
548 .customIf(IsPtrVecPred)
549 .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
550 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
551
552 getActionDefinitionsBuilder(G_STORE)
553 .customIf([=](const LegalityQuery &Query) {
554 return HasRCPC3 && Query.Types[0] == s128 &&
555 Query.MMODescrs[0].Ordering == AtomicOrdering::Release;
556 })
557 .customIf([=](const LegalityQuery &Query) {
558 return Query.Types[0] == s128 &&
559 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
560 })
561 .legalForTypesWithMemDesc(
562 {{s8, p0, s8, 8}, {s16, p0, s8, 8}, // truncstorei8 from s16
563 {s32, p0, s8, 8}, // truncstorei8 from s32
564 {s64, p0, s8, 8}, // truncstorei8 from s64
565 {s16, p0, s16, 8}, {s32, p0, s16, 8}, // truncstorei16 from s32
566 {s64, p0, s16, 8}, // truncstorei16 from s64
567 {s32, p0, s8, 8}, {s32, p0, s16, 8}, {s32, p0, s32, 8},
568 {s64, p0, s64, 8}, {s64, p0, s32, 8}, // truncstorei32 from s64
569 {p0, p0, s64, 8}, {s128, p0, s128, 8}, {v16s8, p0, s128, 8},
570 {v8s8, p0, s64, 8}, {v4s16, p0, s64, 8}, {v8s16, p0, s128, 8},
571 {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
572 .legalForTypesWithMemDesc({
573 // SVE vscale x 128 bit base sizes
574 // TODO: Add nxv2p0. Consider bitcastIf.
575 // See #92130
576 // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
577 {nxv16s8, p0, nxv16s8, 8},
578 {nxv8s16, p0, nxv8s16, 8},
579 {nxv4s32, p0, nxv4s32, 8},
580 {nxv2s64, p0, nxv2s64, 8},
581 })
582 .clampScalar(0, s8, s64)
583 .minScalarOrElt(0, s8)
584 .lowerIf([=](const LegalityQuery &Query) {
585 return Query.Types[0].isScalar() &&
586 Query.Types[0] != Query.MMODescrs[0].MemoryTy;
587 })
588 // Maximum: sN * k = 128
589 .clampMaxNumElements(0, s8, 16)
590 .clampMaxNumElements(0, s16, 8)
591 .clampMaxNumElements(0, s32, 4)
592 .clampMaxNumElements(0, s64, 2)
593 .clampMaxNumElements(0, p0, 2)
594 .lowerIfMemSizeNotPow2()
595 // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
596 .bitcastIf(all(typeInSet(0, {v4s8}),
597 LegalityPredicate([=](const LegalityQuery &Query) {
598 return Query.Types[0].getSizeInBits() ==
599 Query.MMODescrs[0].MemoryTy.getSizeInBits();
600 })),
601 [=](const LegalityQuery &Query) {
602 const LLT VecTy = Query.Types[0];
603 return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
604 })
605 .customIf(IsPtrVecPred)
606 .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
607 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
608 .lower();
609
610 getActionDefinitionsBuilder(G_INDEXED_STORE)
611 // Idx 0 == Ptr, Idx 1 == Val
612 // TODO: we can implement legalizations but as of now these are
613 // generated in a very specific way.
614 .legalForTypesWithMemDesc({
615 {p0, s8, s8, 8},
616 {p0, s16, s16, 8},
617 {p0, s32, s8, 8},
618 {p0, s32, s16, 8},
619 {p0, s32, s32, 8},
620 {p0, s64, s64, 8},
621 {p0, p0, p0, 8},
622 {p0, v8s8, v8s8, 8},
623 {p0, v16s8, v16s8, 8},
624 {p0, v4s16, v4s16, 8},
625 {p0, v8s16, v8s16, 8},
626 {p0, v2s32, v2s32, 8},
627 {p0, v4s32, v4s32, 8},
628 {p0, v2s64, v2s64, 8},
629 {p0, v2p0, v2p0, 8},
630 {p0, s128, s128, 8},
631 })
632 .unsupported();
633
634 auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {
635 LLT LdTy = Query.Types[0];
636 LLT PtrTy = Query.Types[1];
637 if (!llvm::is_contained(PackedVectorAllTypesVec, LdTy) &&
638 !llvm::is_contained(ScalarAndPtrTypesVec, LdTy) && LdTy != s128)
639 return false;
640 if (PtrTy != p0)
641 return false;
642 return true;
643 };
644 getActionDefinitionsBuilder(G_INDEXED_LOAD)
645 .unsupportedIf(
646 atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
647 .legalIf(IndexedLoadBasicPred)
648 .unsupported();
649 getActionDefinitionsBuilder({G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD})
650 .unsupportedIf(
651 atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
652 .legalIf(all(typeInSet(0, {s16, s32, s64}),
653 LegalityPredicate([=](const LegalityQuery &Q) {
654 LLT LdTy = Q.Types[0];
655 LLT PtrTy = Q.Types[1];
656 LLT MemTy = Q.MMODescrs[0].MemoryTy;
657 if (PtrTy != p0)
658 return false;
659 if (LdTy == s16)
660 return MemTy == s8;
661 if (LdTy == s32)
662 return MemTy == s8 || MemTy == s16;
663 if (LdTy == s64)
664 return MemTy == s8 || MemTy == s16 || MemTy == s32;
665 return false;
666 })))
667 .unsupported();
668
669 // Constants
670 getActionDefinitionsBuilder(G_CONSTANT)
671 .legalFor({p0, s8, s16, s32, s64})
672 .widenScalarToNextPow2(0)
673 .clampScalar(0, s8, s64);
674 getActionDefinitionsBuilder(G_FCONSTANT)
675 .legalFor({s32, s64, s128})
676 .legalFor(HasFP16, {s16})
677 .clampScalar(0, MinFPScalar, s128);
678
679 // FIXME: fix moreElementsToNextPow2
680 getActionDefinitionsBuilder(G_ICMP)
681 .legalFor({{s32, s32}, {s32, s64}, {s32, p0}})
682 .widenScalarOrEltToNextPow2(1)
683 .clampScalar(1, s32, s64)
684 .clampScalar(0, s32, s32)
685 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
686 .minScalarEltSameAsIf(
687 [=](const LegalityQuery &Query) {
688 const LLT &Ty = Query.Types[0];
689 const LLT &SrcTy = Query.Types[1];
690 return Ty.isVector() && !SrcTy.isPointerVector() &&
691 Ty.getElementType() != SrcTy.getElementType();
692 },
693 0, 1)
694 .minScalarOrEltIf(
695 [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
696 1, s32)
697 .minScalarOrEltIf(
698 [=](const LegalityQuery &Query) {
699 return Query.Types[1].isPointerVector();
700 },
701 0, s64)
702 .moreElementsToNextPow2(1)
703 .clampNumElements(1, v8s8, v16s8)
704 .clampNumElements(1, v4s16, v8s16)
705 .clampNumElements(1, v2s32, v4s32)
706 .clampNumElements(1, v2s64, v2s64)
707 .clampNumElements(1, v2p0, v2p0)
708 .customIf(isVector(0));
709
710 getActionDefinitionsBuilder(G_FCMP)
711 .legalFor({{s32, s32},
712 {s32, s64},
713 {v4s32, v4s32},
714 {v2s32, v2s32},
715 {v2s64, v2s64}})
716 .legalFor(HasFP16, {{s32, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
717 .widenScalarOrEltToNextPow2(1)
718 .clampScalar(0, s32, s32)
719 .minScalarOrElt(1, MinFPScalar)
720 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
721 .minScalarEltSameAsIf(
722 [=](const LegalityQuery &Query) {
723 const LLT &Ty = Query.Types[0];
724 const LLT &SrcTy = Query.Types[1];
725 return Ty.isVector() && !SrcTy.isPointerVector() &&
726 Ty.getElementType() != SrcTy.getElementType();
727 },
728 0, 1)
729 .clampNumElements(1, v4s16, v8s16)
730 .clampNumElements(1, v2s32, v4s32)
731 .clampMaxNumElements(1, s64, 2)
732 .moreElementsToNextPow2(1)
733 .libcallFor({{s32, s128}});
734
735 // Extensions
736 auto ExtLegalFunc = [=](const LegalityQuery &Query) {
737 unsigned DstSize = Query.Types[0].getSizeInBits();
738
739 // Handle legal vectors using legalFor
740 if (Query.Types[0].isVector())
741 return false;
742
743 if (DstSize < 8 || DstSize >= 128 || !isPowerOf2_32(DstSize))
744 return false; // Extending to a scalar s128 needs narrowing.
745
746 const LLT &SrcTy = Query.Types[1];
747
748 // Make sure we fit in a register otherwise. Don't bother checking that
749 // the source type is below 128 bits. We shouldn't be allowing anything
750 // through which is wider than the destination in the first place.
751 unsigned SrcSize = SrcTy.getSizeInBits();
752 if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
753 return false;
754
755 return true;
756 };
757 getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
758 .legalIf(ExtLegalFunc)
759 .legalFor({{v8s16, v8s8}, {v4s32, v4s16}, {v2s64, v2s32}})
760 .clampScalar(0, s64, s64) // Just for s128, others are handled above.
761 .moreElementsToNextPow2(0)
762 .clampMaxNumElements(1, s8, 8)
763 .clampMaxNumElements(1, s16, 4)
764 .clampMaxNumElements(1, s32, 2)
765 // Tries to convert a large EXTEND into two smaller EXTENDs
766 .lowerIf([=](const LegalityQuery &Query) {
767 return (Query.Types[0].getScalarSizeInBits() >
768 Query.Types[1].getScalarSizeInBits() * 2) &&
769 Query.Types[0].isVector() &&
770 (Query.Types[1].getScalarSizeInBits() == 8 ||
771 Query.Types[1].getScalarSizeInBits() == 16);
772 })
773 .clampMinNumElements(1, s8, 8)
774 .clampMinNumElements(1, s16, 4)
775 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
776
777 getActionDefinitionsBuilder(G_TRUNC)
778 .legalFor({{v8s8, v8s16}, {v4s16, v4s32}, {v2s32, v2s64}})
779 .moreElementsToNextPow2(0)
780 .clampMaxNumElements(0, s8, 8)
781 .clampMaxNumElements(0, s16, 4)
782 .clampMaxNumElements(0, s32, 2)
783 .minScalarOrEltIf(
784 [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
785 0, s8)
786 .lowerIf([=](const LegalityQuery &Query) {
787 LLT DstTy = Query.Types[0];
788 LLT SrcTy = Query.Types[1];
789 return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
790 DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
791 })
792 .clampMinNumElements(0, s8, 8)
793 .clampMinNumElements(0, s16, 4)
794 .alwaysLegal();
795
796 getActionDefinitionsBuilder(G_SEXT_INREG)
797 .legalFor({s32, s64})
798 .legalFor(PackedVectorAllTypeList)
799 .maxScalar(0, s64)
800 .clampNumElements(0, v8s8, v16s8)
801 .clampNumElements(0, v4s16, v8s16)
802 .clampNumElements(0, v2s32, v4s32)
803 .clampMaxNumElements(0, s64, 2)
804 .lower();
805
806 // FP conversions
807 getActionDefinitionsBuilder(G_FPTRUNC)
808 .legalFor(
809 {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
810 .libcallFor({{s16, s128}, {s32, s128}, {s64, s128}})
811 .clampNumElements(0, v4s16, v4s16)
812 .clampNumElements(0, v2s32, v2s32)
813 .scalarize(0);
814
815 getActionDefinitionsBuilder(G_FPEXT)
816 .legalFor(
817 {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
818 .libcallFor({{s128, s64}, {s128, s32}, {s128, s16}})
819 .clampNumElements(0, v4s32, v4s32)
820 .clampNumElements(0, v2s64, v2s64)
821 .scalarize(0);
822
823 // Conversions
824 getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
825 .legalFor({{s32, s32},
826 {s64, s32},
827 {s32, s64},
828 {s64, s64},
829 {v2s32, v2s32},
830 {v4s32, v4s32},
831 {v2s64, v2s64}})
832 .legalFor(HasFP16,
833 {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
834 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
835 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
836 // The range of a fp16 value fits into an i17, so we can lower the width
837 // to i64.
838 .narrowScalarIf(
839 [=](const LegalityQuery &Query) {
840 return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64;
841 },
842 changeTo(0, s64))
843 .moreElementsToNextPow2(0)
844 .widenScalarOrEltToNextPow2OrMinSize(0)
845 .minScalar(0, s32)
846 .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
847 .widenScalarIf(
848 [=](const LegalityQuery &Query) {
849 return Query.Types[0].getScalarSizeInBits() <= 64 &&
850 Query.Types[0].getScalarSizeInBits() >
851 Query.Types[1].getScalarSizeInBits();
852 },
853 LegalizeMutations::changeElementSizeTo(1, 0))
854 .widenScalarIf(
855 [=](const LegalityQuery &Query) {
856 return Query.Types[1].getScalarSizeInBits() <= 64 &&
857 Query.Types[0].getScalarSizeInBits() <
858 Query.Types[1].getScalarSizeInBits();
859 },
860 LegalizeMutations::changeElementSizeTo(0, 1))
861 .clampNumElements(0, v4s16, v8s16)
862 .clampNumElements(0, v2s32, v4s32)
863 .clampMaxNumElements(0, s64, 2)
864 .libcallFor(
865 {{s32, s128}, {s64, s128}, {s128, s128}, {s128, s32}, {s128, s64}});
866
867 getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
868 .legalFor({{s32, s32},
869 {s64, s32},
870 {s32, s64},
871 {s64, s64},
872 {v2s32, v2s32},
873 {v4s32, v4s32},
874 {v2s64, v2s64}})
875 .legalFor(HasFP16,
876 {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
877 // Handle types larger than i64 by scalarizing/lowering.
878 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
879 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
880 // The range of a fp16 value fits into an i17, so we can lower the width
881 // to i64.
882 .narrowScalarIf(
883 [=](const LegalityQuery &Query) {
884 return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64;
885 },
886 changeTo(0, s64))
887 .lowerIf(::any(scalarWiderThan(0, 64), scalarWiderThan(1, 64)), 0)
888 .moreElementsToNextPow2(0)
889 .widenScalarToNextPow2(0, /*MinSize=*/32)
890 .minScalar(0, s32)
891 .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
892 .widenScalarIf(
893 [=](const LegalityQuery &Query) {
894 unsigned ITySize = Query.Types[0].getScalarSizeInBits();
895 return (ITySize == 16 || ITySize == 32 || ITySize == 64) &&
896 ITySize > Query.Types[1].getScalarSizeInBits();
897 },
898 LegalizeMutations::changeElementSizeTo(1, 0))
899 .widenScalarIf(
900 [=](const LegalityQuery &Query) {
901 unsigned FTySize = Query.Types[1].getScalarSizeInBits();
902 return (FTySize == 16 || FTySize == 32 || FTySize == 64) &&
903 Query.Types[0].getScalarSizeInBits() < FTySize;
904 },
905 LegalizeMutations::changeElementSizeTo(0, 1))
906 .widenScalarOrEltToNextPow2(0)
907 .clampNumElements(0, v4s16, v8s16)
908 .clampNumElements(0, v2s32, v4s32)
909 .clampMaxNumElements(0, s64, 2);
910
911 getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
912 .legalFor({{s32, s32},
913 {s64, s32},
914 {s32, s64},
915 {s64, s64},
916 {v2s32, v2s32},
917 {v4s32, v4s32},
918 {v2s64, v2s64}})
919 .legalFor(HasFP16,
920 {{s16, s32}, {s16, s64}, {v4s16, v4s16}, {v8s16, v8s16}})
921 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
922 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
923 .moreElementsToNextPow2(1)
924 .widenScalarOrEltToNextPow2OrMinSize(1)
925 .minScalar(1, s32)
926 .lowerIf([](const LegalityQuery &Query) {
927 return Query.Types[1].isVector() &&
928 Query.Types[1].getScalarSizeInBits() == 64 &&
929 Query.Types[0].getScalarSizeInBits() == 16;
930 })
931 .widenScalarOrEltToNextPow2OrMinSize(0, /*MinSize=*/HasFP16 ? 16 : 32)
932 .scalarizeIf(
933 // v2i64->v2f32 needs to scalarize to avoid double-rounding issues.
934 [](const LegalityQuery &Query) {
935 return Query.Types[0].getScalarSizeInBits() == 32 &&
936 Query.Types[1].getScalarSizeInBits() == 64;
937 },
938 0)
939 .widenScalarIf(
940 [](const LegalityQuery &Query) {
941 return Query.Types[1].getScalarSizeInBits() <= 64 &&
942 Query.Types[0].getScalarSizeInBits() <
943 Query.Types[1].getScalarSizeInBits();
944 },
945 LegalizeMutations::changeElementSizeTo(0, 1))
946 .widenScalarIf(
947 [](const LegalityQuery &Query) {
948 return Query.Types[0].getScalarSizeInBits() <= 64 &&
949 Query.Types[0].getScalarSizeInBits() >
950 Query.Types[1].getScalarSizeInBits();
951 },
952 LegalizeMutations::changeElementSizeTo(1, 0))
953 .clampNumElements(0, v4s16, v8s16)
954 .clampNumElements(0, v2s32, v4s32)
955 .clampMaxNumElements(0, s64, 2)
956 .libcallFor({{s16, s128},
957 {s32, s128},
958 {s64, s128},
959 {s128, s128},
960 {s128, s32},
961 {s128, s64}});
962
963 // Control-flow
964 getActionDefinitionsBuilder(G_BRCOND)
965 .legalFor({s32})
966 .clampScalar(0, s32, s32);
967 getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
968
969 getActionDefinitionsBuilder(G_SELECT)
970 .legalFor({{s32, s32}, {s64, s32}, {p0, s32}})
971 .widenScalarToNextPow2(0)
972 .clampScalar(0, s32, s64)
973 .clampScalar(1, s32, s32)
974 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
975 .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
976 .lowerIf(isVector(0));
977
978 // Pointer-handling
979 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
980
981 if (TM.getCodeModel() == CodeModel::Small)
982 getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
983 else
984 getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
985
986 getActionDefinitionsBuilder(G_PTRAUTH_GLOBAL_VALUE)
987 .legalIf(all(typeIs(0, p0), typeIs(1, p0)));
988
989 getActionDefinitionsBuilder(G_PTRTOINT)
990 .legalFor({{s64, p0}, {v2s64, v2p0}})
991 .widenScalarToNextPow2(0, 64)
992 .clampScalar(0, s64, s64)
993 .clampMaxNumElements(0, s64, 2);
994
995 getActionDefinitionsBuilder(G_INTTOPTR)
996 .unsupportedIf([&](const LegalityQuery &Query) {
997 return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
998 })
999 .legalFor({{p0, s64}, {v2p0, v2s64}})
1000 .clampMaxNumElements(1, s64, 2);
1001
1002 // Casts for 32 and 64-bit width type are just copies.
1003 // Same for 128-bit width type, except they are on the FPR bank.
1004 getActionDefinitionsBuilder(G_BITCAST)
1005 // Keeping 32-bit instructions legal to prevent regression in some tests
1006 .legalForCartesianProduct({s32, v2s16, v4s8})
1007 .legalForCartesianProduct({s64, v8s8, v4s16, v2s32})
1008 .legalForCartesianProduct({s128, v16s8, v8s16, v4s32, v2s64, v2p0})
1009 .customIf([=](const LegalityQuery &Query) {
1010 // Handle casts from i1 vectors to scalars.
1011 LLT DstTy = Query.Types[0];
1012 LLT SrcTy = Query.Types[1];
1013 return DstTy.isScalar() && SrcTy.isVector() &&
1014 SrcTy.getScalarSizeInBits() == 1;
1015 })
1016 .lowerIf([=](const LegalityQuery &Query) {
1017 return Query.Types[0].isVector() != Query.Types[1].isVector();
1018 })
1019 .moreElementsToNextPow2(0)
1020 .clampNumElements(0, v8s8, v16s8)
1021 .clampNumElements(0, v4s16, v8s16)
1022 .clampNumElements(0, v2s32, v4s32)
1023 .lower();
1024
1025 getActionDefinitionsBuilder(G_VASTART).legalFor({p0});
1026
1027 // va_list must be a pointer, but most sized types are pretty easy to handle
1028 // as the destination.
1029 getActionDefinitionsBuilder(G_VAARG)
1030 .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
1031 .clampScalar(0, s8, s64)
1032 .widenScalarToNextPow2(0, /*Min*/ 8);
1033
1034 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
1035 .lowerIf(
1036 all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
1037
1038 bool UseOutlineAtomics = ST.outlineAtomics() && !ST.hasLSE();
1039
1040 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1041 .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}})
1042 .customFor(!UseOutlineAtomics, {{s128, p0}})
1043 .libcallFor(UseOutlineAtomics,
1044 {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}, {s128, p0}})
1045 .clampScalar(0, s32, s64);
1046
1047 getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
1048 G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR,
1049 G_ATOMICRMW_XOR})
1050 .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}})
1051 .libcallFor(UseOutlineAtomics,
1052 {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}})
1053 .clampScalar(0, s32, s64);
1054
1055 // Do not outline these atomics operations, as per comment in
1056 // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR().
1057 getActionDefinitionsBuilder(
1058 {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
1059 .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)))
1060 .clampScalar(0, s32, s64);
1061
1062 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});
1063
1064 // Merge/Unmerge
1065 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1066 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1067 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1068 getActionDefinitionsBuilder(Op)
1069 .widenScalarToNextPow2(LitTyIdx, 8)
1070 .widenScalarToNextPow2(BigTyIdx, 32)
1071 .clampScalar(LitTyIdx, s8, s64)
1072 .clampScalar(BigTyIdx, s32, s128)
1073 .legalIf([=](const LegalityQuery &Q) {
1074 switch (Q.Types[BigTyIdx].getSizeInBits()) {
1075 case 32:
1076 case 64:
1077 case 128:
1078 break;
1079 default:
1080 return false;
1081 }
1082 switch (Q.Types[LitTyIdx].getSizeInBits()) {
1083 case 8:
1084 case 16:
1085 case 32:
1086 case 64:
1087 return true;
1088 default:
1089 return false;
1090 }
1091 });
1092 }
1093
1094 // TODO : nxv4s16, nxv2s16, nxv2s32
1095 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1096 .legalFor(HasSVE, {{s16, nxv16s8, s64},
1097 {s16, nxv8s16, s64},
1098 {s32, nxv4s32, s64},
1099 {s64, nxv2s64, s64}})
1100 .unsupportedIf([=](const LegalityQuery &Query) {
1101 const LLT &EltTy = Query.Types[1].getElementType();
1102 if (Query.Types[1].isScalableVector())
1103 return false;
1104 return Query.Types[0] != EltTy;
1105 })
1106 .minScalar(2, s64)
1107 .customIf([=](const LegalityQuery &Query) {
1108 const LLT &VecTy = Query.Types[1];
1109 return VecTy == v8s8 || VecTy == v16s8 || VecTy == v2s16 ||
1110 VecTy == v4s16 || VecTy == v8s16 || VecTy == v2s32 ||
1111 VecTy == v4s32 || VecTy == v2s64 || VecTy == v2p0;
1112 })
1113 .minScalarOrEltIf(
1114 [=](const LegalityQuery &Query) {
1115 // We want to promote to <M x s1> to <M x s64> if that wouldn't
1116 // cause the total vec size to be > 128b.
1117 return Query.Types[1].isFixedVector() &&
1118 Query.Types[1].getNumElements() <= 2;
1119 },
1120 0, s64)
1121 .minScalarOrEltIf(
1122 [=](const LegalityQuery &Query) {
1123 return Query.Types[1].isFixedVector() &&
1124 Query.Types[1].getNumElements() <= 4;
1125 },
1126 0, s32)
1127 .minScalarOrEltIf(
1128 [=](const LegalityQuery &Query) {
1129 return Query.Types[1].isFixedVector() &&
1130 Query.Types[1].getNumElements() <= 8;
1131 },
1132 0, s16)
1133 .minScalarOrEltIf(
1134 [=](const LegalityQuery &Query) {
1135 return Query.Types[1].isFixedVector() &&
1136 Query.Types[1].getNumElements() <= 16;
1137 },
1138 0, s8)
1139 .minScalarOrElt(0, s8) // Worst case, we need at least s8.
1140 .moreElementsToNextPow2(1)
1141 .clampMaxNumElements(1, s64, 2)
1142 .clampMaxNumElements(1, s32, 4)
1143 .clampMaxNumElements(1, s16, 8)
1144 .clampMaxNumElements(1, s8, 16)
1145 .clampMaxNumElements(1, p0, 2);
1146
1147 getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
1148 .legalIf(
1149 typeInSet(0, {v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64, v2p0}))
1150 .legalFor(HasSVE, {{nxv16s8, s32, s64},
1151 {nxv8s16, s32, s64},
1152 {nxv4s32, s32, s64},
1153 {nxv2s64, s64, s64}})
1154 .moreElementsToNextPow2(0)
1155 .widenVectorEltsToVectorMinSize(0, 64)
1156 .clampNumElements(0, v8s8, v16s8)
1157 .clampNumElements(0, v4s16, v8s16)
1158 .clampNumElements(0, v2s32, v4s32)
1159 .clampMaxNumElements(0, s64, 2)
1160 .clampMaxNumElements(0, p0, 2);
1161
1162 getActionDefinitionsBuilder(G_BUILD_VECTOR)
1163 .legalFor({{v8s8, s8},
1164 {v16s8, s8},
1165 {v4s16, s16},
1166 {v8s16, s16},
1167 {v2s32, s32},
1168 {v4s32, s32},
1169 {v2s64, s64},
1170 {v2p0, p0}})
1171 .clampNumElements(0, v4s32, v4s32)
1172 .clampNumElements(0, v2s64, v2s64)
1173 .minScalarOrElt(0, s8)
1174 .widenVectorEltsToVectorMinSize(0, 64)
1175 .widenScalarOrEltToNextPow2(0)
1176 .minScalarSameAs(1, 0);
1177
1178 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
1179
1180 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1181 .legalIf([=](const LegalityQuery &Query) {
1182 const LLT &DstTy = Query.Types[0];
1183 const LLT &SrcTy = Query.Types[1];
1184 // For now just support the TBL2 variant which needs the source vectors
1185 // to be the same size as the dest.
1186 if (DstTy != SrcTy)
1187 return false;
1188 return llvm::is_contained(
1189 {v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64}, DstTy);
1190 })
1191 // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors) or scalar
1192 // destinations, we just want those lowered into G_BUILD_VECTOR or
1193 // G_EXTRACT_ELEMENT.
1194 .lowerIf([=](const LegalityQuery &Query) {
1195 return !Query.Types[0].isVector() || !Query.Types[1].isVector();
1196 })
1197 .moreElementsIf(
1198 [](const LegalityQuery &Query) {
1199 return Query.Types[0].isVector() && Query.Types[1].isVector() &&
1200 Query.Types[0].getNumElements() >
1201 Query.Types[1].getNumElements();
1202 },
1203 changeTo(1, 0))
1204 .moreElementsToNextPow2(0)
1205 .moreElementsIf(
1206 [](const LegalityQuery &Query) {
1207 return Query.Types[0].isVector() && Query.Types[1].isVector() &&
1208 Query.Types[0].getNumElements() <
1209 Query.Types[1].getNumElements();
1210 },
1211 changeTo(0, 1))
1212 .widenScalarOrEltToNextPow2OrMinSize(0, 8)
1213 .clampNumElements(0, v8s8, v16s8)
1214 .clampNumElements(0, v4s16, v8s16)
1215 .clampNumElements(0, v4s32, v4s32)
1216 .clampNumElements(0, v2s64, v2s64)
1217 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
1218 .bitcastIf(isPointerVector(0), [=](const LegalityQuery &Query) {
1219 // Bitcast pointers vector to i64.
1220 const LLT DstTy = Query.Types[0];
1221 return std::pair(0, LLT::vector(DstTy.getElementCount(), 64));
1222 });
1223
1224 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1225 .legalFor({{v16s8, v8s8}, {v8s16, v4s16}, {v4s32, v2s32}})
1226 .bitcastIf(
1227 [=](const LegalityQuery &Query) {
1228 return Query.Types[0].getSizeInBits() <= 128 &&
1229 Query.Types[1].getSizeInBits() <= 64;
1230 },
1231 [=](const LegalityQuery &Query) {
1232 const LLT DstTy = Query.Types[0];
1233 const LLT SrcTy = Query.Types[1];
1234 return std::pair(
1235 0, DstTy.changeElementSize(SrcTy.getSizeInBits())
1236 .changeElementCount(
1237 DstTy.getElementCount().divideCoefficientBy(
1238 SrcTy.getNumElements())));
1239 });
1240
1241 getActionDefinitionsBuilder(G_EXTRACT_SUBVECTOR)
1242 .legalFor({{v8s8, v16s8}, {v4s16, v8s16}, {v2s32, v4s32}})
1243 .widenScalarOrEltToNextPow2(0)
1244 .immIdx(0); // Inform verifier imm idx 0 is handled.
1245
1246 // TODO: {nxv16s8, s8}, {nxv8s16, s16}
1247 getActionDefinitionsBuilder(G_SPLAT_VECTOR)
1248 .legalFor(HasSVE, {{nxv4s32, s32}, {nxv2s64, s64}});
1249
1250 getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({p0});
1251
1252 getActionDefinitionsBuilder(G_BRJT).legalFor({{p0, s64}});
1253
1254 getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom();
1255
1256 getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower();
1257
1258 if (ST.hasMOPS()) {
1259 // G_BZERO is not supported. Currently it is only emitted by
1260 // PreLegalizerCombiner for G_MEMSET with zero constant.
1261 getActionDefinitionsBuilder(G_BZERO).unsupported();
1262
1263 getActionDefinitionsBuilder(G_MEMSET)
1264 .legalForCartesianProduct({p0}, {s64}, {s64})
1265 .customForCartesianProduct({p0}, {s8}, {s64})
1266 .immIdx(0); // Inform verifier imm idx 0 is handled.
1267
1268 getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
1269 .legalForCartesianProduct({p0}, {p0}, {s64})
1270 .immIdx(0); // Inform verifier imm idx 0 is handled.
1271
1272 // G_MEMCPY_INLINE does not have a tailcall immediate
1273 getActionDefinitionsBuilder(G_MEMCPY_INLINE)
1274 .legalForCartesianProduct({p0}, {p0}, {s64});
1275
1276 } else {
1277 getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
1278 .libcall();
1279 }
1280
1281 // For fadd reductions we have pairwise operations available. We treat the
1282 // usual legal types as legal and handle the lowering to pairwise instructions
1283 // later.
1284 getActionDefinitionsBuilder(G_VECREDUCE_FADD)
1285 .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
1286 .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}})
1287 .minScalarOrElt(0, MinFPScalar)
1288 .clampMaxNumElements(1, s64, 2)
1289 .clampMaxNumElements(1, s32, 4)
1290 .clampMaxNumElements(1, s16, 8)
1291 .moreElementsToNextPow2(1)
1292 .scalarize(1)
1293 .lower();
1294
1295 // For fmul reductions we need to split up into individual operations. We
1296 // clamp to 128 bit vectors then to 64bit vectors to produce a cascade of
1297 // smaller types, followed by scalarizing what remains.
1298 getActionDefinitionsBuilder(G_VECREDUCE_FMUL)
1299 .minScalarOrElt(0, MinFPScalar)
1300 .clampMaxNumElements(1, s64, 2)
1301 .clampMaxNumElements(1, s32, 4)
1302 .clampMaxNumElements(1, s16, 8)
1303 .clampMaxNumElements(1, s32, 2)
1304 .clampMaxNumElements(1, s16, 4)
1305 .scalarize(1)
1306 .lower();
1307
1308 getActionDefinitionsBuilder({G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL})
1309 .scalarize(2)
1310 .lower();
1311
1312 getActionDefinitionsBuilder(G_VECREDUCE_ADD)
1313 .legalFor({{s8, v8s8},
1314 {s8, v16s8},
1315 {s16, v4s16},
1316 {s16, v8s16},
1317 {s32, v2s32},
1318 {s32, v4s32},
1319 {s64, v2s64}})
1320 .moreElementsToNextPow2(1)
1321 .clampMaxNumElements(1, s64, 2)
1322 .clampMaxNumElements(1, s32, 4)
1323 .clampMaxNumElements(1, s16, 8)
1324 .clampMaxNumElements(1, s8, 16)
1325 .widenVectorEltsToVectorMinSize(1, 64)
1326 .scalarize(1);
1327
1328 getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX,
1329 G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM})
1330 .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
1331 .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}})
1332 .minScalarOrElt(0, MinFPScalar)
1333 .clampMaxNumElements(1, s64, 2)
1334 .clampMaxNumElements(1, s32, 4)
1335 .clampMaxNumElements(1, s16, 8)
1336 .lower();
1337
1338 getActionDefinitionsBuilder(G_VECREDUCE_MUL)
1339 .clampMaxNumElements(1, s32, 2)
1340 .clampMaxNumElements(1, s16, 4)
1341 .clampMaxNumElements(1, s8, 8)
1342 .scalarize(1)
1343 .lower();
1344
1345 getActionDefinitionsBuilder(
1346 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX})
1347 .legalFor({{s8, v8s8},
1348 {s8, v16s8},
1349 {s16, v4s16},
1350 {s16, v8s16},
1351 {s32, v2s32},
1352 {s32, v4s32}})
1353 .moreElementsIf(
1354 [=](const LegalityQuery &Query) {
1355 return Query.Types[1].isVector() &&
1356 Query.Types[1].getElementType() != s8 &&
1357 Query.Types[1].getNumElements() & 1;
1358 },
1359 LegalizeMutations::moreElementsToNextPow2(1))
1360 .clampMaxNumElements(1, s64, 2)
1361 .clampMaxNumElements(1, s32, 4)
1362 .clampMaxNumElements(1, s16, 8)
1363 .clampMaxNumElements(1, s8, 16)
1364 .scalarize(1)
1365 .lower();
1366
1367 getActionDefinitionsBuilder(
1368 {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
1369 // Try to break down into smaller vectors as long as they're at least 64
1370 // bits. This lets us use vector operations for some parts of the
1371 // reduction.
1372 .fewerElementsIf(
1373 [=](const LegalityQuery &Q) {
1374 LLT SrcTy = Q.Types[1];
1375 if (SrcTy.isScalar())
1376 return false;
1377 if (!isPowerOf2_32(SrcTy.getNumElements()))
1378 return false;
1379 // We can usually perform 64b vector operations.
1380 return SrcTy.getSizeInBits() > 64;
1381 },
1382 [=](const LegalityQuery &Q) {
1383 LLT SrcTy = Q.Types[1];
1384 return std::make_pair(1, SrcTy.divide(2));
1385 })
1386 .scalarize(1)
1387 .lower();
1388
1389 // TODO: Update this to correct handling when adding AArch64/SVE support.
1390 getActionDefinitionsBuilder(G_VECTOR_COMPRESS).lower();
1391
1392 // Access to floating-point environment.
1393 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV,
1394 G_GET_FPMODE, G_SET_FPMODE, G_RESET_FPMODE})
1395 .libcall();
1396
1397 getActionDefinitionsBuilder(G_IS_FPCLASS).lower();
1398
1399 getActionDefinitionsBuilder(G_PREFETCH).custom();
1400
1401 getActionDefinitionsBuilder({G_SCMP, G_UCMP}).lower();
1402
1403 getLegacyLegalizerInfo().computeTables();
1404 verify(*ST.getInstrInfo());
1405 }
1406
legalizeCustom(LegalizerHelper & Helper,MachineInstr & MI,LostDebugLocObserver & LocObserver) const1407 bool AArch64LegalizerInfo::legalizeCustom(
1408 LegalizerHelper &Helper, MachineInstr &MI,
1409 LostDebugLocObserver &LocObserver) const {
1410 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1411 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1412 GISelChangeObserver &Observer = Helper.Observer;
1413 switch (MI.getOpcode()) {
1414 default:
1415 // No idea what to do.
1416 return false;
1417 case TargetOpcode::G_VAARG:
1418 return legalizeVaArg(MI, MRI, MIRBuilder);
1419 case TargetOpcode::G_LOAD:
1420 case TargetOpcode::G_STORE:
1421 return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
1422 case TargetOpcode::G_SHL:
1423 case TargetOpcode::G_ASHR:
1424 case TargetOpcode::G_LSHR:
1425 return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
1426 case TargetOpcode::G_GLOBAL_VALUE:
1427 return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
1428 case TargetOpcode::G_SBFX:
1429 case TargetOpcode::G_UBFX:
1430 return legalizeBitfieldExtract(MI, MRI, Helper);
1431 case TargetOpcode::G_FSHL:
1432 case TargetOpcode::G_FSHR:
1433 return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper);
1434 case TargetOpcode::G_ROTR:
1435 return legalizeRotate(MI, MRI, Helper);
1436 case TargetOpcode::G_CTPOP:
1437 return legalizeCTPOP(MI, MRI, Helper);
1438 case TargetOpcode::G_ATOMIC_CMPXCHG:
1439 return legalizeAtomicCmpxchg128(MI, MRI, Helper);
1440 case TargetOpcode::G_CTTZ:
1441 return legalizeCTTZ(MI, Helper);
1442 case TargetOpcode::G_BZERO:
1443 case TargetOpcode::G_MEMCPY:
1444 case TargetOpcode::G_MEMMOVE:
1445 case TargetOpcode::G_MEMSET:
1446 return legalizeMemOps(MI, Helper);
1447 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1448 return legalizeExtractVectorElt(MI, MRI, Helper);
1449 case TargetOpcode::G_DYN_STACKALLOC:
1450 return legalizeDynStackAlloc(MI, Helper);
1451 case TargetOpcode::G_PREFETCH:
1452 return legalizePrefetch(MI, Helper);
1453 case TargetOpcode::G_ABS:
1454 return Helper.lowerAbsToCNeg(MI);
1455 case TargetOpcode::G_ICMP:
1456 return legalizeICMP(MI, MRI, MIRBuilder);
1457 case TargetOpcode::G_BITCAST:
1458 return legalizeBitcast(MI, Helper);
1459 }
1460
1461 llvm_unreachable("expected switch to return");
1462 }
1463
legalizeBitcast(MachineInstr & MI,LegalizerHelper & Helper) const1464 bool AArch64LegalizerInfo::legalizeBitcast(MachineInstr &MI,
1465 LegalizerHelper &Helper) const {
1466 assert(MI.getOpcode() == TargetOpcode::G_BITCAST && "Unexpected opcode");
1467 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
1468 // We're trying to handle casts from i1 vectors to scalars but reloading from
1469 // stack.
1470 if (!DstTy.isScalar() || !SrcTy.isVector() ||
1471 SrcTy.getElementType() != LLT::scalar(1))
1472 return false;
1473
1474 Helper.createStackStoreLoad(DstReg, SrcReg);
1475 MI.eraseFromParent();
1476 return true;
1477 }
1478
legalizeFunnelShift(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & MIRBuilder,GISelChangeObserver & Observer,LegalizerHelper & Helper) const1479 bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
1480 MachineRegisterInfo &MRI,
1481 MachineIRBuilder &MIRBuilder,
1482 GISelChangeObserver &Observer,
1483 LegalizerHelper &Helper) const {
1484 assert(MI.getOpcode() == TargetOpcode::G_FSHL ||
1485 MI.getOpcode() == TargetOpcode::G_FSHR);
1486
1487 // Keep as G_FSHR if shift amount is a G_CONSTANT, else use generic
1488 // lowering
1489 Register ShiftNo = MI.getOperand(3).getReg();
1490 LLT ShiftTy = MRI.getType(ShiftNo);
1491 auto VRegAndVal = getIConstantVRegValWithLookThrough(ShiftNo, MRI);
1492
1493 // Adjust shift amount according to Opcode (FSHL/FSHR)
1494 // Convert FSHL to FSHR
1495 LLT OperationTy = MRI.getType(MI.getOperand(0).getReg());
1496 APInt BitWidth(ShiftTy.getSizeInBits(), OperationTy.getSizeInBits(), false);
1497
1498 // Lower non-constant shifts and leave zero shifts to the optimizer.
1499 if (!VRegAndVal || VRegAndVal->Value.urem(BitWidth) == 0)
1500 return (Helper.lowerFunnelShiftAsShifts(MI) ==
1501 LegalizerHelper::LegalizeResult::Legalized);
1502
1503 APInt Amount = VRegAndVal->Value.urem(BitWidth);
1504
1505 Amount = MI.getOpcode() == TargetOpcode::G_FSHL ? BitWidth - Amount : Amount;
1506
1507 // If the instruction is G_FSHR, has a 64-bit G_CONSTANT for shift amount
1508 // in the range of 0 <-> BitWidth, it is legal
1509 if (ShiftTy.getSizeInBits() == 64 && MI.getOpcode() == TargetOpcode::G_FSHR &&
1510 VRegAndVal->Value.ult(BitWidth))
1511 return true;
1512
1513 // Cast the ShiftNumber to a 64-bit type
1514 auto Cast64 = MIRBuilder.buildConstant(LLT::scalar(64), Amount.zext(64));
1515
1516 if (MI.getOpcode() == TargetOpcode::G_FSHR) {
1517 Observer.changingInstr(MI);
1518 MI.getOperand(3).setReg(Cast64.getReg(0));
1519 Observer.changedInstr(MI);
1520 }
1521 // If Opcode is FSHL, remove the FSHL instruction and create a FSHR
1522 // instruction
1523 else if (MI.getOpcode() == TargetOpcode::G_FSHL) {
1524 MIRBuilder.buildInstr(TargetOpcode::G_FSHR, {MI.getOperand(0).getReg()},
1525 {MI.getOperand(1).getReg(), MI.getOperand(2).getReg(),
1526 Cast64.getReg(0)});
1527 MI.eraseFromParent();
1528 }
1529 return true;
1530 }
1531
legalizeICMP(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & MIRBuilder) const1532 bool AArch64LegalizerInfo::legalizeICMP(MachineInstr &MI,
1533 MachineRegisterInfo &MRI,
1534 MachineIRBuilder &MIRBuilder) const {
1535 Register DstReg = MI.getOperand(0).getReg();
1536 Register SrcReg1 = MI.getOperand(2).getReg();
1537 Register SrcReg2 = MI.getOperand(3).getReg();
1538 LLT DstTy = MRI.getType(DstReg);
1539 LLT SrcTy = MRI.getType(SrcReg1);
1540
1541 // Check the vector types are legal
1542 if (DstTy.getScalarSizeInBits() != SrcTy.getScalarSizeInBits() ||
1543 DstTy.getNumElements() != SrcTy.getNumElements() ||
1544 (DstTy.getSizeInBits() != 64 && DstTy.getSizeInBits() != 128))
1545 return false;
1546
1547 // Lowers G_ICMP NE => G_ICMP EQ to allow better pattern matching for
1548 // following passes
1549 CmpInst::Predicate Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
1550 if (Pred != CmpInst::ICMP_NE)
1551 return true;
1552 Register CmpReg =
1553 MIRBuilder
1554 .buildICmp(CmpInst::ICMP_EQ, MRI.getType(DstReg), SrcReg1, SrcReg2)
1555 .getReg(0);
1556 MIRBuilder.buildNot(DstReg, CmpReg);
1557
1558 MI.eraseFromParent();
1559 return true;
1560 }
1561
legalizeRotate(MachineInstr & MI,MachineRegisterInfo & MRI,LegalizerHelper & Helper) const1562 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
1563 MachineRegisterInfo &MRI,
1564 LegalizerHelper &Helper) const {
1565 // To allow for imported patterns to match, we ensure that the rotate amount
1566 // is 64b with an extension.
1567 Register AmtReg = MI.getOperand(2).getReg();
1568 LLT AmtTy = MRI.getType(AmtReg);
1569 (void)AmtTy;
1570 assert(AmtTy.isScalar() && "Expected a scalar rotate");
1571 assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
1572 auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
1573 Helper.Observer.changingInstr(MI);
1574 MI.getOperand(2).setReg(NewAmt.getReg(0));
1575 Helper.Observer.changedInstr(MI);
1576 return true;
1577 }
1578
legalizeSmallCMGlobalValue(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & MIRBuilder,GISelChangeObserver & Observer) const1579 bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
1580 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1581 GISelChangeObserver &Observer) const {
1582 assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
1583 // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
1584 // G_ADD_LOW instructions.
1585 // By splitting this here, we can optimize accesses in the small code model by
1586 // folding in the G_ADD_LOW into the load/store offset.
1587 auto &GlobalOp = MI.getOperand(1);
1588 // Don't modify an intrinsic call.
1589 if (GlobalOp.isSymbol())
1590 return true;
1591 const auto* GV = GlobalOp.getGlobal();
1592 if (GV->isThreadLocal())
1593 return true; // Don't want to modify TLS vars.
1594
1595 auto &TM = ST->getTargetLowering()->getTargetMachine();
1596 unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);
1597
1598 if (OpFlags & AArch64II::MO_GOT)
1599 return true;
1600
1601 auto Offset = GlobalOp.getOffset();
1602 Register DstReg = MI.getOperand(0).getReg();
1603 auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
1604 .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
1605 // Set the regclass on the dest reg too.
1606 MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1607
1608 // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
1609 // by creating a MOVK that sets bits 48-63 of the register to (global address
1610 // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
1611 // prevent an incorrect tag being generated during relocation when the
1612 // global appears before the code section. Without the offset, a global at
1613 // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
1614 // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
1615 // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
1616 // instead of `0xf`.
1617 // This assumes that we're in the small code model so we can assume a binary
1618 // size of <= 4GB, which makes the untagged PC relative offset positive. The
1619 // binary must also be loaded into address range [0, 2^48). Both of these
1620 // properties need to be ensured at runtime when using tagged addresses.
1621 if (OpFlags & AArch64II::MO_TAGGED) {
1622 assert(!Offset &&
1623 "Should not have folded in an offset for a tagged global!");
1624 ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
1625 .addGlobalAddress(GV, 0x100000000,
1626 AArch64II::MO_PREL | AArch64II::MO_G3)
1627 .addImm(48);
1628 MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1629 }
1630
1631 MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
1632 .addGlobalAddress(GV, Offset,
1633 OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
1634 MI.eraseFromParent();
1635 return true;
1636 }
1637
legalizeIntrinsic(LegalizerHelper & Helper,MachineInstr & MI) const1638 bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
1639 MachineInstr &MI) const {
1640 MachineIRBuilder &MIB = Helper.MIRBuilder;
1641 MachineRegisterInfo &MRI = *MIB.getMRI();
1642
1643 auto LowerBinOp = [&MI, &MIB](unsigned Opcode) {
1644 MIB.buildInstr(Opcode, {MI.getOperand(0)},
1645 {MI.getOperand(2), MI.getOperand(3)});
1646 MI.eraseFromParent();
1647 return true;
1648 };
1649
1650 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
1651 switch (IntrinsicID) {
1652 case Intrinsic::vacopy: {
1653 unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
1654 unsigned VaListSize =
1655 (ST->isTargetDarwin() || ST->isTargetWindows())
1656 ? PtrSize
1657 : ST->isTargetILP32() ? 20 : 32;
1658
1659 MachineFunction &MF = *MI.getMF();
1660 auto Val = MF.getRegInfo().createGenericVirtualRegister(
1661 LLT::scalar(VaListSize * 8));
1662 MIB.buildLoad(Val, MI.getOperand(2),
1663 *MF.getMachineMemOperand(MachinePointerInfo(),
1664 MachineMemOperand::MOLoad,
1665 VaListSize, Align(PtrSize)));
1666 MIB.buildStore(Val, MI.getOperand(1),
1667 *MF.getMachineMemOperand(MachinePointerInfo(),
1668 MachineMemOperand::MOStore,
1669 VaListSize, Align(PtrSize)));
1670 MI.eraseFromParent();
1671 return true;
1672 }
1673 case Intrinsic::get_dynamic_area_offset: {
1674 MIB.buildConstant(MI.getOperand(0).getReg(), 0);
1675 MI.eraseFromParent();
1676 return true;
1677 }
1678 case Intrinsic::aarch64_mops_memset_tag: {
1679 assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1680 // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
1681 // the instruction).
1682 auto &Value = MI.getOperand(3);
1683 Register ExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
1684 Value.setReg(ExtValueReg);
1685 return true;
1686 }
1687 case Intrinsic::aarch64_prefetch: {
1688 auto &AddrVal = MI.getOperand(1);
1689
1690 int64_t IsWrite = MI.getOperand(2).getImm();
1691 int64_t Target = MI.getOperand(3).getImm();
1692 int64_t IsStream = MI.getOperand(4).getImm();
1693 int64_t IsData = MI.getOperand(5).getImm();
1694
1695 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
1696 (!IsData << 3) | // IsDataCache bit
1697 (Target << 1) | // Cache level bits
1698 (unsigned)IsStream; // Stream bit
1699
1700 MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
1701 MI.eraseFromParent();
1702 return true;
1703 }
1704 case Intrinsic::aarch64_neon_uaddv:
1705 case Intrinsic::aarch64_neon_saddv:
1706 case Intrinsic::aarch64_neon_umaxv:
1707 case Intrinsic::aarch64_neon_smaxv:
1708 case Intrinsic::aarch64_neon_uminv:
1709 case Intrinsic::aarch64_neon_sminv: {
1710 bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv ||
1711 IntrinsicID == Intrinsic::aarch64_neon_smaxv ||
1712 IntrinsicID == Intrinsic::aarch64_neon_sminv;
1713
1714 auto OldDst = MI.getOperand(0).getReg();
1715 auto OldDstTy = MRI.getType(OldDst);
1716 LLT NewDstTy = MRI.getType(MI.getOperand(2).getReg()).getElementType();
1717 if (OldDstTy == NewDstTy)
1718 return true;
1719
1720 auto NewDst = MRI.createGenericVirtualRegister(NewDstTy);
1721
1722 Helper.Observer.changingInstr(MI);
1723 MI.getOperand(0).setReg(NewDst);
1724 Helper.Observer.changedInstr(MI);
1725
1726 MIB.setInsertPt(MIB.getMBB(), ++MIB.getInsertPt());
1727 MIB.buildExtOrTrunc(IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT,
1728 OldDst, NewDst);
1729
1730 return true;
1731 }
1732 case Intrinsic::aarch64_neon_uaddlp:
1733 case Intrinsic::aarch64_neon_saddlp: {
1734 unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
1735 ? AArch64::G_UADDLP
1736 : AArch64::G_SADDLP;
1737 MIB.buildInstr(Opc, {MI.getOperand(0)}, {MI.getOperand(2)});
1738 MI.eraseFromParent();
1739
1740 return true;
1741 }
1742 case Intrinsic::aarch64_neon_uaddlv:
1743 case Intrinsic::aarch64_neon_saddlv: {
1744 unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
1745 ? AArch64::G_UADDLV
1746 : AArch64::G_SADDLV;
1747 Register DstReg = MI.getOperand(0).getReg();
1748 Register SrcReg = MI.getOperand(2).getReg();
1749 LLT DstTy = MRI.getType(DstReg);
1750
1751 LLT MidTy, ExtTy;
1752 if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) {
1753 MidTy = LLT::fixed_vector(4, 32);
1754 ExtTy = LLT::scalar(32);
1755 } else {
1756 MidTy = LLT::fixed_vector(2, 64);
1757 ExtTy = LLT::scalar(64);
1758 }
1759
1760 Register MidReg =
1761 MIB.buildInstr(Opc, {MidTy}, {SrcReg})->getOperand(0).getReg();
1762 Register ZeroReg =
1763 MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
1764 Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy},
1765 {MidReg, ZeroReg})
1766 .getReg(0);
1767
1768 if (DstTy.getScalarSizeInBits() < 32)
1769 MIB.buildTrunc(DstReg, ExtReg);
1770 else
1771 MIB.buildCopy(DstReg, ExtReg);
1772
1773 MI.eraseFromParent();
1774
1775 return true;
1776 }
1777 case Intrinsic::aarch64_neon_smax:
1778 return LowerBinOp(TargetOpcode::G_SMAX);
1779 case Intrinsic::aarch64_neon_smin:
1780 return LowerBinOp(TargetOpcode::G_SMIN);
1781 case Intrinsic::aarch64_neon_umax:
1782 return LowerBinOp(TargetOpcode::G_UMAX);
1783 case Intrinsic::aarch64_neon_umin:
1784 return LowerBinOp(TargetOpcode::G_UMIN);
1785 case Intrinsic::aarch64_neon_fmax:
1786 return LowerBinOp(TargetOpcode::G_FMAXIMUM);
1787 case Intrinsic::aarch64_neon_fmin:
1788 return LowerBinOp(TargetOpcode::G_FMINIMUM);
1789 case Intrinsic::aarch64_neon_fmaxnm:
1790 return LowerBinOp(TargetOpcode::G_FMAXNUM);
1791 case Intrinsic::aarch64_neon_fminnm:
1792 return LowerBinOp(TargetOpcode::G_FMINNUM);
1793 case Intrinsic::aarch64_neon_smull:
1794 return LowerBinOp(AArch64::G_SMULL);
1795 case Intrinsic::aarch64_neon_umull:
1796 return LowerBinOp(AArch64::G_UMULL);
1797 case Intrinsic::aarch64_neon_abs: {
1798 // Lower the intrinsic to G_ABS.
1799 MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});
1800 MI.eraseFromParent();
1801 return true;
1802 }
1803 case Intrinsic::aarch64_neon_sqadd: {
1804 if (MRI.getType(MI.getOperand(0).getReg()).isVector())
1805 return LowerBinOp(TargetOpcode::G_SADDSAT);
1806 break;
1807 }
1808 case Intrinsic::aarch64_neon_sqsub: {
1809 if (MRI.getType(MI.getOperand(0).getReg()).isVector())
1810 return LowerBinOp(TargetOpcode::G_SSUBSAT);
1811 break;
1812 }
1813 case Intrinsic::aarch64_neon_uqadd: {
1814 if (MRI.getType(MI.getOperand(0).getReg()).isVector())
1815 return LowerBinOp(TargetOpcode::G_UADDSAT);
1816 break;
1817 }
1818 case Intrinsic::aarch64_neon_uqsub: {
1819 if (MRI.getType(MI.getOperand(0).getReg()).isVector())
1820 return LowerBinOp(TargetOpcode::G_USUBSAT);
1821 break;
1822 }
1823
1824 case Intrinsic::vector_reverse:
1825 // TODO: Add support for vector_reverse
1826 return false;
1827 }
1828
1829 return true;
1830 }
1831
legalizeShlAshrLshr(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & MIRBuilder,GISelChangeObserver & Observer) const1832 bool AArch64LegalizerInfo::legalizeShlAshrLshr(
1833 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1834 GISelChangeObserver &Observer) const {
1835 assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
1836 MI.getOpcode() == TargetOpcode::G_LSHR ||
1837 MI.getOpcode() == TargetOpcode::G_SHL);
1838 // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
1839 // imported patterns can select it later. Either way, it will be legal.
1840 Register AmtReg = MI.getOperand(2).getReg();
1841 auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
1842 if (!VRegAndVal)
1843 return true;
1844 // Check the shift amount is in range for an immediate form.
1845 int64_t Amount = VRegAndVal->Value.getSExtValue();
1846 if (Amount > 31)
1847 return true; // This will have to remain a register variant.
1848 auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
1849 Observer.changingInstr(MI);
1850 MI.getOperand(2).setReg(ExtCst.getReg(0));
1851 Observer.changedInstr(MI);
1852 return true;
1853 }
1854
matchLDPSTPAddrMode(Register Root,Register & Base,int & Offset,MachineRegisterInfo & MRI)1855 static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
1856 MachineRegisterInfo &MRI) {
1857 Base = Root;
1858 Offset = 0;
1859
1860 Register NewBase;
1861 int64_t NewOffset;
1862 if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
1863 isShiftedInt<7, 3>(NewOffset)) {
1864 Base = NewBase;
1865 Offset = NewOffset;
1866 }
1867 }
1868
1869 // FIXME: This should be removed and replaced with the generic bitcast legalize
1870 // action.
/// Custom legalization for G_LOAD / G_STORE.
///
/// Two value types are handled:
///  - s128: the access is replaced by a paired 64-bit target instruction
///    (LDIAPPX/STILPX or LDPXi/STPXi), with the two s64 halves merged into, or
///    unmerged from, the original value register.
///  - vectors of address-space-0 pointers: the access is rebuilt on an integer
///    vector with the same element count and element size, with a G_BITCAST
///    to/from the original value register.
///
/// \returns false (leaving \p MI in place) if the value type matches neither
/// case; otherwise true after \p MI has been replaced and erased.
bool AArch64LegalizerInfo::legalizeLoadStore(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_STORE ||
         MI.getOpcode() == TargetOpcode::G_LOAD);
  // Here we just try to handle vector loads/stores where our value type might
  // have pointer elements, which the SelectionDAG importer can't handle. To
  // allow the existing patterns for s64 to fire for p0, we just try to bitcast
  // the value to use s64 types.

  // Custom legalization requires the instruction, if not deleted, must be fully
  // legalized. In order to allow further legalization of the inst, we create
  // a new instruction and erase the existing one.

  Register ValReg = MI.getOperand(0).getReg();
  const LLT ValTy = MRI.getType(ValReg);

  // 128-bit scalars are emitted directly as paired-register target
  // instructions.
  if (ValTy == LLT::scalar(128)) {

    // Only the first memory operand is consulted for the ordering.
    AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
    bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
    bool IsLoadAcquire = IsLoad && Ordering == AtomicOrdering::Acquire;
    bool IsStoreRelease = !IsLoad && Ordering == AtomicOrdering::Release;
    // The acquire/release pair forms are used only when the subtarget reports
    // both LSE2 and RCPC3.
    bool IsRcpC3 =
        ST->hasLSE2() && ST->hasRCPC3() && (IsLoadAcquire || IsStoreRelease);

    LLT s64 = LLT::scalar(64);

    unsigned Opcode;
    if (IsRcpC3) {
      Opcode = IsLoad ? AArch64::LDIAPPX : AArch64::STILPX;
    } else {
      // For LSE2, loads/stores should have been converted to monotonic and had
      // a fence inserted after them.
      assert(Ordering == AtomicOrdering::Monotonic ||
             Ordering == AtomicOrdering::Unordered);
      assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");

      Opcode = IsLoad ? AArch64::LDPXi : AArch64::STPXi;
    }

    MachineInstrBuilder NewI;
    if (IsLoad) {
      // Two s64 defs, recombined into the original s128 value register.
      NewI = MIRBuilder.buildInstr(Opcode, {s64, s64}, {});
      MIRBuilder.buildMergeLikeInstr(
          ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
    } else {
      // Split the stored value into two s64 halves used as source operands.
      auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
      NewI = MIRBuilder.buildInstr(
          Opcode, {}, {Split->getOperand(0), Split->getOperand(1)});
    }

    // Append the address operand(s): the RCPC3 forms get just the pointer
    // register, the LDP/STP forms get a base register plus an immediate.
    if (IsRcpC3) {
      NewI.addUse(MI.getOperand(1).getReg());
    } else {
      Register Base;
      int Offset;
      matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
      NewI.addUse(Base);
      // The matched offset is divided by 8 to form the immediate.
      NewI.addImm(Offset / 8);
    }

    // Carry over the memory operands and constrain the register operands of
    // the newly built target instruction.
    NewI.cloneMemRefs(MI);
    constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
                                     *MRI.getTargetRegisterInfo(),
                                     *ST->getRegBankInfo());
    MI.eraseFromParent();
    return true;
  }

  // Anything else must be a vector of address-space-0 pointers.
  if (!ValTy.isPointerVector() ||
      ValTy.getElementType().getAddressSpace() != 0) {
    LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
    return false;
  }

  // Integer vector with the same element count and per-element bit width.
  unsigned PtrSize = ValTy.getElementType().getSizeInBits();
  const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
  // The existing memory operand is retyped in place and then reused for the
  // replacement instruction.
  auto &MMO = **MI.memoperands_begin();
  MMO.setType(NewTy);

  if (MI.getOpcode() == TargetOpcode::G_STORE) {
    auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
    MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
  } else {
    auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
    MIRBuilder.buildBitcast(ValReg, NewLoad);
  }
  MI.eraseFromParent();
  return true;
}
1962
legalizeVaArg(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & MIRBuilder) const1963 bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
1964 MachineRegisterInfo &MRI,
1965 MachineIRBuilder &MIRBuilder) const {
1966 MachineFunction &MF = MIRBuilder.getMF();
1967 Align Alignment(MI.getOperand(2).getImm());
1968 Register Dst = MI.getOperand(0).getReg();
1969 Register ListPtr = MI.getOperand(1).getReg();
1970
1971 LLT PtrTy = MRI.getType(ListPtr);
1972 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1973
1974 const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
1975 const Align PtrAlign = Align(PtrSize);
1976 auto List = MIRBuilder.buildLoad(
1977 PtrTy, ListPtr,
1978 *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1979 PtrTy, PtrAlign));
1980
1981 MachineInstrBuilder DstPtr;
1982 if (Alignment > PtrAlign) {
1983 // Realign the list to the actual required alignment.
1984 auto AlignMinus1 =
1985 MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
1986 auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
1987 DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
1988 } else
1989 DstPtr = List;
1990
1991 LLT ValTy = MRI.getType(Dst);
1992 uint64_t ValSize = ValTy.getSizeInBits() / 8;
1993 MIRBuilder.buildLoad(
1994 Dst, DstPtr,
1995 *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1996 ValTy, std::max(Alignment, PtrAlign)));
1997
1998 auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));
1999
2000 auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));
2001
2002 MIRBuilder.buildStore(NewList, ListPtr,
2003 *MF.getMachineMemOperand(MachinePointerInfo(),
2004 MachineMemOperand::MOStore,
2005 PtrTy, PtrAlign));
2006
2007 MI.eraseFromParent();
2008 return true;
2009 }
2010
legalizeBitfieldExtract(MachineInstr & MI,MachineRegisterInfo & MRI,LegalizerHelper & Helper) const2011 bool AArch64LegalizerInfo::legalizeBitfieldExtract(
2012 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2013 // Only legal if we can select immediate forms.
2014 // TODO: Lower this otherwise.
2015 return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
2016 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2017 }
2018
/// Custom lowering of a population count whose source (operand 1) and
/// destination (operand 0) have the same type.
///
/// Paths, in the order they are tried:
///  - CSSC available and the type is s128: split into two s64 halves, popcount
///    each, add, and zero-extend the sum into the destination.
///  - No NEON, or the function is marked noimplicitfloat: fall back to
///    LegalizerHelper::lowerBitCount, but only for s32/s64 scalars.
///  - Otherwise: bitcast the value to a byte vector, popcount each byte, and
///    sum the byte counts either with G_UDOT (when dot-product is available
///    and the type is a vector with non-16-bit elements) or with a chain of
///    uaddlv/uaddlp intrinsics.
///
/// \returns true if \p MI was replaced; false if the generic fallback path was
/// taken and either the type was not an s32/s64 scalar or lowerBitCount did
/// not report Legalized.
bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         LegalizerHelper &Helper) const {
  // When there is no integer popcount instruction (FEAT_CSSC isn't available),
  // it can be more efficiently lowered to the following sequence that uses
  // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
  // registers are cheap.
  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
  //
  // For 128 bit vector popcounts, we lower to the following sequence:
  //  cnt.16b   v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.4s v0, v0  //        v4s32, v2s64
  //  uaddlp.2d v0, v0  //               v2s64
  //
  // For 64 bit vector popcounts, we lower to the following sequence:
  //  cnt.8b    v0, v0  // v4s16, v2s32
  //  uaddlp.4h v0, v0  // v4s16, v2s32
  //  uaddlp.2s v0, v0  //        v2s32

  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  Register Dst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Val);
  unsigned Size = Ty.getSizeInBits();

  assert(Ty == MRI.getType(Dst) &&
         "Expected src and dst to have the same type!");

  // With CSSC, an s128 popcount is the sum of the popcounts of its two s64
  // halves, zero-extended back to the destination type.
  if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
    LLT s64 = LLT::scalar(64);

    auto Split = MIRBuilder.buildUnmerge(s64, Val);
    auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0));
    auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1));
    auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2);

    MIRBuilder.buildZExt(Dst, Add);
    MI.eraseFromParent();
    return true;
  }

  if (!ST->hasNEON() ||
      MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) {
    // Use generic lowering when custom lowering is not possible.
    // The && short-circuits: lowerBitCount is only invoked for s32/s64
    // scalars; any other type yields false without touching MI.
    return Ty.isScalar() && (Size == 32 || Size == 64) &&
           Helper.lowerBitCount(MI) ==
               LegalizerHelper::LegalizeResult::Legalized;
  }

  // Pre-conditioning: widen Val up to the nearest vector type.
  // s32,s64,v4s16,v2s32 -> v8i8
  // v8s16,v4s32,v2s64 -> v16i8
  LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
  if (Ty.isScalar()) {
    assert((Size == 32 || Size == 64 || Size == 128) && "Expected only 32, 64, or 128 bit scalars!");
    // s32 is zero-extended to s64 so that it can be bitcast to the 64-bit
    // byte vector.
    if (Size == 32) {
      Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
    }
  }
  Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);

  // Count bits in each byte-sized lane.
  auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);

  // Sum across lanes.

  // Dot-product path: a G_UDOT of the byte counts against an all-ones byte
  // vector, accumulating into zero, is used for vector types whose elements
  // are not 16 bits wide.
  if (ST->hasDotProd() && Ty.isVector() && Ty.getNumElements() >= 2 &&
      Ty.getScalarSizeInBits() != 16) {
    // v2s64 uses a v4s32 G_UDOT result that is then widened; the other shapes
    // use the result type directly.
    LLT Dt = Ty == LLT::fixed_vector(2, 64) ? LLT::fixed_vector(4, 32) : Ty;
    auto Zeros = MIRBuilder.buildConstant(Dt, 0);
    auto Ones = MIRBuilder.buildConstant(VTy, 1);
    MachineInstrBuilder Sum;

    if (Ty == LLT::fixed_vector(2, 64)) {
      auto UDOT =
          MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
      Sum = MIRBuilder.buildInstr(AArch64::G_UADDLP, {Ty}, {UDOT});
    } else if (Ty == LLT::fixed_vector(4, 32)) {
      Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
    } else if (Ty == LLT::fixed_vector(2, 32)) {
      Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
    } else {
      llvm_unreachable("unexpected vector shape");
    }

    // Retarget the final instruction's def to the original destination.
    Sum->getOperand(0).setReg(Dst);
    MI.eraseFromParent();
    return true;
  }

  // Intrinsic path: HAddTys lists the result type of each successive
  // horizontal-add step applied to the byte counts.
  Register HSum = CTPOP.getReg(0);
  unsigned Opc;
  SmallVector<LLT> HAddTys;
  if (Ty.isScalar()) {
    Opc = Intrinsic::aarch64_neon_uaddlv;
    HAddTys.push_back(LLT::scalar(32));
  } else if (Ty == LLT::fixed_vector(8, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
  } else if (Ty == LLT::fixed_vector(4, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
  } else if (Ty == LLT::fixed_vector(2, 64)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
    HAddTys.push_back(LLT::fixed_vector(2, 64));
  } else if (Ty == LLT::fixed_vector(4, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
  } else if (Ty == LLT::fixed_vector(2, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
    HAddTys.push_back(LLT::fixed_vector(2, 32));
  } else
    llvm_unreachable("unexpected vector shape");
  // Chain the steps: each intrinsic consumes the previous step's result.
  MachineInstrBuilder UADD;
  for (LLT HTy : HAddTys) {
    UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}).addUse(HSum);
    HSum = UADD.getReg(0);
  }

  // Post-conditioning.
  // 64- and 128-bit scalars zero-extend the s32 sum into Dst; every other type
  // already has the right result type, so the last def is retargeted to Dst.
  if (Ty.isScalar() && (Size == 64 || Size == 128))
    MIRBuilder.buildZExt(Dst, UADD);
  else
    UADD->getOperand(0).setReg(Dst);
  MI.eraseFromParent();
  return true;
}
2154
/// Custom legalization of a 128-bit atomic compare-and-swap.
///
/// Operand 1 is the address; operands 2 and 3 are each split into two s64
/// halves. With LSE, a CASP* instruction is built on s128 registers assembled
/// with REG_SEQUENCE, and the result halves are taken with G_EXTRACT. Without
/// LSE, a CMP_SWAP_128* pseudo taking individual 64-bit registers is built.
/// In both cases the opcode variant is chosen from the merged ordering of the
/// first memory operand, and the two result halves are merged into operand 0.
///
/// \returns true after \p MI has been erased.
bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  LLT s64 = LLT::scalar(64);
  auto Addr = MI.getOperand(1).getReg();
  // Split both 128-bit inputs into s64 halves (index 0 and 1 of each unmerge).
  auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
  auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
  // Registers receiving the two halves of the result.
  auto DstLo = MRI.createGenericVirtualRegister(s64);
  auto DstHi = MRI.createGenericVirtualRegister(s64);

  MachineInstrBuilder CAS;
  if (ST->hasLSE()) {
    // We have 128-bit CASP instructions taking XSeqPair registers, which are
    // s128. We need the merge/unmerge to bracket the expansion and pair up with
    // the rest of the MIR so we must reassemble the extracted registers into a
    // 128-bit known-regclass one with code like this:
    //
    //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
    //     %out = CASP %in1, ...
    //     %OldLo = G_EXTRACT %out, 0
    //     %OldHi = G_EXTRACT %out, 64
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    // Pick the CASP variant for the ordering; orderings not listed explicitly
    // use the plain CASPX form.
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CASPAX;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CASPLX;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CASPALX;
      break;
    default:
      Opcode = AArch64::CASPX;
      break;
    }

    LLT s128 = LLT::scalar(128);
    auto CASDst = MRI.createGenericVirtualRegister(s128);
    auto CASDesired = MRI.createGenericVirtualRegister(s128);
    auto CASNew = MRI.createGenericVirtualRegister(s128);
    // Pack each pair of halves into an s128 register: half 0 in sube64 and
    // half 1 in subo64.
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
        .addUse(DesiredI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(DesiredI->getOperand(1).getReg())
        .addImm(AArch64::subo64);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
        .addUse(NewI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(NewI->getOperand(1).getReg())
        .addImm(AArch64::subo64);

    CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});

    // Pull the two 64-bit halves out of the s128 result (bit offsets 0 and
    // 64).
    MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
    MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
  } else {
    // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
    // can take arbitrary registers so it just has the normal GPR64 operands the
    // rest of AArch64 is expecting.
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    // Pick the pseudo variant for the ordering; orderings not listed
    // explicitly use the monotonic form.
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CMP_SWAP_128_RELEASE;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CMP_SWAP_128;
      break;
    default:
      Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
      break;
    }

    // The pseudo defines the two result halves plus a GPR64 scratch register.
    auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
    CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
                                {Addr, DesiredI->getOperand(0),
                                 DesiredI->getOperand(1), NewI->getOperand(0),
                                 NewI->getOperand(1)});
  }

  // Carry over the memory operands and constrain the register operands of the
  // newly built target instruction.
  CAS.cloneMemRefs(MI);
  constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
                                   *MRI.getTargetRegisterInfo(),
                                   *ST->getRegBankInfo());

  // Reassemble the 128-bit result for the original destination.
  MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi});
  MI.eraseFromParent();
  return true;
}
2251
legalizeCTTZ(MachineInstr & MI,LegalizerHelper & Helper) const2252 bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
2253 LegalizerHelper &Helper) const {
2254 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2255 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
2256 LLT Ty = MRI.getType(MI.getOperand(1).getReg());
2257 auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
2258 MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
2259 MI.eraseFromParent();
2260 return true;
2261 }
2262
legalizeMemOps(MachineInstr & MI,LegalizerHelper & Helper) const2263 bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
2264 LegalizerHelper &Helper) const {
2265 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2266
2267 // Tagged version MOPSMemorySetTagged is legalised in legalizeIntrinsic
2268 if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
2269 // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
2270 // the instruction).
2271 auto &Value = MI.getOperand(1);
2272 Register ExtValueReg =
2273 MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
2274 Value.setReg(ExtValueReg);
2275 return true;
2276 }
2277
2278 return false;
2279 }
2280
legalizeExtractVectorElt(MachineInstr & MI,MachineRegisterInfo & MRI,LegalizerHelper & Helper) const2281 bool AArch64LegalizerInfo::legalizeExtractVectorElt(
2282 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2283 const GExtractVectorElement *Element = cast<GExtractVectorElement>(&MI);
2284 auto VRegAndVal =
2285 getIConstantVRegValWithLookThrough(Element->getIndexReg(), MRI);
2286 if (VRegAndVal)
2287 return true;
2288 LLT VecTy = MRI.getType(Element->getVectorReg());
2289 if (VecTy.isScalableVector())
2290 return true;
2291 return Helper.lowerExtractInsertVectorElt(MI) !=
2292 LegalizerHelper::LegalizeResult::UnableToLegalize;
2293 }
2294
legalizeDynStackAlloc(MachineInstr & MI,LegalizerHelper & Helper) const2295 bool AArch64LegalizerInfo::legalizeDynStackAlloc(
2296 MachineInstr &MI, LegalizerHelper &Helper) const {
2297 MachineFunction &MF = *MI.getParent()->getParent();
2298 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2299 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
2300
2301 // If stack probing is not enabled for this function, use the default
2302 // lowering.
2303 if (!MF.getFunction().hasFnAttribute("probe-stack") ||
2304 MF.getFunction().getFnAttribute("probe-stack").getValueAsString() !=
2305 "inline-asm") {
2306 Helper.lowerDynStackAlloc(MI);
2307 return true;
2308 }
2309
2310 Register Dst = MI.getOperand(0).getReg();
2311 Register AllocSize = MI.getOperand(1).getReg();
2312 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
2313
2314 assert(MRI.getType(Dst) == LLT::pointer(0, 64) &&
2315 "Unexpected type for dynamic alloca");
2316 assert(MRI.getType(AllocSize) == LLT::scalar(64) &&
2317 "Unexpected type for dynamic alloca");
2318
2319 LLT PtrTy = MRI.getType(Dst);
2320 Register SPReg =
2321 Helper.getTargetLowering().getStackPointerRegisterToSaveRestore();
2322 Register SPTmp =
2323 Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
2324 auto NewMI =
2325 MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp});
2326 MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass);
2327 MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI);
2328 MIRBuilder.buildCopy(Dst, SPTmp);
2329
2330 MI.eraseFromParent();
2331 return true;
2332 }
2333
legalizePrefetch(MachineInstr & MI,LegalizerHelper & Helper) const2334 bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
2335 LegalizerHelper &Helper) const {
2336 MachineIRBuilder &MIB = Helper.MIRBuilder;
2337 auto &AddrVal = MI.getOperand(0);
2338
2339 int64_t IsWrite = MI.getOperand(1).getImm();
2340 int64_t Locality = MI.getOperand(2).getImm();
2341 int64_t IsData = MI.getOperand(3).getImm();
2342
2343 bool IsStream = Locality == 0;
2344 if (Locality != 0) {
2345 assert(Locality <= 3 && "Prefetch locality out-of-range");
2346 // The locality degree is the opposite of the cache speed.
2347 // Put the number the other way around.
2348 // The encoding starts at 0 for level 1
2349 Locality = 3 - Locality;
2350 }
2351
2352 unsigned PrfOp = (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;
2353
2354 MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
2355 MI.eraseFromParent();
2356 return true;
2357 }
2358