1 //===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AArch64LegalizerInfo.h"
15 #include "AArch64RegisterBankInfo.h"
16 #include "AArch64Subtarget.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
19 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
20 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
21 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/Utils.h"
24 #include "llvm/CodeGen/MachineInstr.h"
25 #include "llvm/CodeGen/MachineRegisterInfo.h"
26 #include "llvm/CodeGen/TargetOpcodes.h"
27 #include "llvm/CodeGen/ValueTypes.h"
28 #include "llvm/IR/DerivedTypes.h"
29 #include "llvm/IR/Intrinsics.h"
30 #include "llvm/IR/IntrinsicsAArch64.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/MathExtras.h"
33 #include <initializer_list>
34
35 #define DEBUG_TYPE "aarch64-legalinfo"
36
37 using namespace llvm;
38 using namespace LegalizeActions;
39 using namespace LegalizeMutations;
40 using namespace LegalityPredicates;
41 using namespace MIPatternMatch;
42
43 AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
44 : ST(&ST) {
45 using namespace TargetOpcode;
46 const LLT p0 = LLT::pointer(0, 64);
47 const LLT s8 = LLT::scalar(8);
48 const LLT s16 = LLT::scalar(16);
49 const LLT s32 = LLT::scalar(32);
50 const LLT s64 = LLT::scalar(64);
51 const LLT s128 = LLT::scalar(128);
52 const LLT v16s8 = LLT::fixed_vector(16, 8);
53 const LLT v8s8 = LLT::fixed_vector(8, 8);
54 const LLT v4s8 = LLT::fixed_vector(4, 8);
55 const LLT v2s8 = LLT::fixed_vector(2, 8);
56 const LLT v8s16 = LLT::fixed_vector(8, 16);
57 const LLT v4s16 = LLT::fixed_vector(4, 16);
58 const LLT v2s16 = LLT::fixed_vector(2, 16);
59 const LLT v2s32 = LLT::fixed_vector(2, 32);
60 const LLT v4s32 = LLT::fixed_vector(4, 32);
61 const LLT v2s64 = LLT::fixed_vector(2, 64);
62 const LLT v2p0 = LLT::fixed_vector(2, p0);
63
64 const LLT nxv16s8 = LLT::scalable_vector(16, s8);
65 const LLT nxv8s16 = LLT::scalable_vector(8, s16);
66 const LLT nxv4s32 = LLT::scalable_vector(4, s32);
67 const LLT nxv2s64 = LLT::scalable_vector(2, s64);
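  // Shorthand used throughout: sN is an N-bit scalar, vMsN is a fixed vector
  // of M sN elements, nxvMsN is the scalable (SVE) equivalent, and p0 is a
  // 64-bit pointer in address space 0.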
68
69 std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
70 v16s8, v8s16, v4s32,
71 v2s64, v2p0,
72 /* End 128bit types */
73 /* Begin 64bit types */
74 v8s8, v4s16, v2s32};
75 std::initializer_list<LLT> ScalarAndPtrTypesList = {s8, s16, s32, s64, p0};
76 SmallVector<LLT, 8> PackedVectorAllTypesVec(PackedVectorAllTypeList);
77 SmallVector<LLT, 8> ScalarAndPtrTypesVec(ScalarAndPtrTypesList);
78
79 const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
80
81 // FIXME: support subtargets which have neon/fp-armv8 disabled.
82 if (!ST.hasNEON() || !ST.hasFPARMv8()) {
83 getLegacyLegalizerInfo().computeTables();
84 return;
85 }
86
87 // Some instructions only support s16 if the subtarget has full 16-bit FP
88 // support.
89 const bool HasFP16 = ST.hasFullFP16();
90 const LLT &MinFPScalar = HasFP16 ? s16 : s32;
91
92 const bool HasCSSC = ST.hasCSSC();
93 const bool HasRCPC3 = ST.hasRCPC3();
94
95 getActionDefinitionsBuilder(
96 {G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER})
97 .legalFor({p0, s8, s16, s32, s64})
98 .legalFor(PackedVectorAllTypeList)
99 .widenScalarToNextPow2(0)
100 .clampScalar(0, s8, s64)
101 .moreElementsToNextPow2(0)
102 .widenVectorEltsToVectorMinSize(0, 64)
103 .clampNumElements(0, v8s8, v16s8)
104 .clampNumElements(0, v4s16, v8s16)
105 .clampNumElements(0, v2s32, v4s32)
106 .clampNumElements(0, v2s64, v2s64);
107
108 getActionDefinitionsBuilder(G_PHI)
109 .legalFor({p0, s16, s32, s64})
110 .legalFor(PackedVectorAllTypeList)
111 .widenScalarToNextPow2(0)
112 .clampScalar(0, s16, s64)
113 // Maximum: sN * k = 128
114 .clampMaxNumElements(0, s8, 16)
115 .clampMaxNumElements(0, s16, 8)
116 .clampMaxNumElements(0, s32, 4)
117 .clampMaxNumElements(0, s64, 2)
118 .clampMaxNumElements(0, p0, 2);
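  // The clampMaxNumElements calls above keep each vector within a single
  // 128-bit Q register (element size times element count <= 128); anything
  // larger is split by the legalizer.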
119
120 getActionDefinitionsBuilder(G_BSWAP)
121 .legalFor({s32, s64, v4s16, v8s16, v2s32, v4s32, v2s64})
122 .widenScalarOrEltToNextPow2(0, 16)
123 .clampScalar(0, s32, s64)
124 .clampNumElements(0, v4s16, v8s16)
125 .clampNumElements(0, v2s32, v4s32)
126 .clampNumElements(0, v2s64, v2s64)
127 .moreElementsToNextPow2(0);
128
129 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
130 .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
131 .widenScalarToNextPow2(0)
132 .clampScalar(0, s32, s64)
133 .clampMaxNumElements(0, s8, 16)
134 .clampMaxNumElements(0, s16, 8)
135 .clampNumElements(0, v2s32, v4s32)
136 .clampNumElements(0, v2s64, v2s64)
137 .minScalarOrEltIf(
138 [=](const LegalityQuery &Query) {
139 return Query.Types[0].getNumElements() <= 2;
140 },
141 0, s32)
142 .minScalarOrEltIf(
143 [=](const LegalityQuery &Query) {
144 return Query.Types[0].getNumElements() <= 4;
145 },
146 0, s16)
147 .minScalarOrEltIf(
148 [=](const LegalityQuery &Query) {
149 return Query.Types[0].getNumElements() <= 16;
150 },
151 0, s8)
152 .moreElementsToNextPow2(0);
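  // The minScalarOrEltIf ladder above widens the elements of short, narrow
  // vectors (e.g. v2s8 -> v2s32, v4s8 -> v4s16) so the result still maps onto
  // a 64-bit or 128-bit NEON register.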
153
154 getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
155 .customIf([=](const LegalityQuery &Query) {
156 const auto &SrcTy = Query.Types[0];
157 const auto &AmtTy = Query.Types[1];
158 return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
159 AmtTy.getSizeInBits() == 32;
160 })
161 .legalFor({
162 {s32, s32},
163 {s32, s64},
164 {s64, s64},
165 {v8s8, v8s8},
166 {v16s8, v16s8},
167 {v4s16, v4s16},
168 {v8s16, v8s16},
169 {v2s32, v2s32},
170 {v4s32, v4s32},
171 {v2s64, v2s64},
172 })
173 .widenScalarToNextPow2(0)
174 .clampScalar(1, s32, s64)
175 .clampScalar(0, s32, s64)
176 .clampNumElements(0, v8s8, v16s8)
177 .clampNumElements(0, v4s16, v8s16)
178 .clampNumElements(0, v2s32, v4s32)
179 .clampNumElements(0, v2s64, v2s64)
180 .moreElementsToNextPow2(0)
181 .minScalarSameAs(1, 0);
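  // 32-bit scalar shifts with a 32-bit amount are marked custom above and are
  // handled by legalizeShlAshrLshr (dispatched from legalizeCustom below),
  // which widens constant shift amounts to 64 bits so the imported selection
  // patterns can match.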
182
183 getActionDefinitionsBuilder(G_PTR_ADD)
184 .legalFor({{p0, s64}, {v2p0, v2s64}})
185 .clampScalarOrElt(1, s64, s64)
186 .clampNumElements(0, v2p0, v2p0);
187
188 getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
189
190 getActionDefinitionsBuilder({G_SDIV, G_UDIV})
191 .legalFor({s32, s64})
192 .libcallFor({s128})
193 .clampScalar(0, s32, s64)
194 .widenScalarToNextPow2(0)
195 .scalarize(0);
196
197 getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
198 .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32})
199 .widenScalarOrEltToNextPow2(0)
200 .clampScalarOrElt(0, s32, s64)
201 .clampNumElements(0, v2s32, v4s32)
202 .clampNumElements(0, v2s64, v2s64)
203 .moreElementsToNextPow2(0);
204
205
206 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
207 .widenScalarToNextPow2(0, /*Min = */ 32)
208 .clampScalar(0, s32, s64)
209 .lower();
210
211 getActionDefinitionsBuilder({G_SMULH, G_UMULH})
212 .legalFor({s64, v8s16, v16s8, v4s32})
213 .lower();
214
215 auto &MinMaxActions = getActionDefinitionsBuilder(
216 {G_SMIN, G_SMAX, G_UMIN, G_UMAX});
217 if (HasCSSC)
218 MinMaxActions
219 .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
220         // Clamping is made conditional on the CSSC extension: without legal
221         // types we lower to a CMP, which can fold one of the two sxtb's we
222         // would otherwise need if we detect a type smaller than 32-bit.
223 .minScalar(0, s32);
224 else
225 MinMaxActions
226 .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32});
227 MinMaxActions
228 .clampNumElements(0, v8s8, v16s8)
229 .clampNumElements(0, v4s16, v8s16)
230 .clampNumElements(0, v2s32, v4s32)
231       // FIXME: This shouldn't be needed as v2s64 types are going to
232 // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
233 .clampNumElements(0, v2s64, v2s64)
234 .lower();
235
236 getActionDefinitionsBuilder(
237 {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
238 .legalFor({{s32, s32}, {s64, s32}})
239 .clampScalar(0, s32, s64)
240 .clampScalar(1, s32, s64)
241 .widenScalarToNextPow2(0);
242
243 getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FNEG,
244 G_FABS, G_FSQRT, G_FMAXNUM, G_FMINNUM,
245 G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR,
246 G_FRINT, G_FNEARBYINT, G_INTRINSIC_TRUNC,
247 G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
248 .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64})
249 .legalIf([=](const LegalityQuery &Query) {
250 const auto &Ty = Query.Types[0];
251 return (Ty == v8s16 || Ty == v4s16) && HasFP16;
252 })
253 .libcallFor({s128})
254 .minScalarOrElt(0, MinFPScalar)
255 .clampNumElements(0, v4s16, v8s16)
256 .clampNumElements(0, v2s32, v4s32)
257 .clampNumElements(0, v2s64, v2s64)
258 .moreElementsToNextPow2(0);
259
260 getActionDefinitionsBuilder(G_FREM)
261 .libcallFor({s32, s64})
262 .minScalar(0, s32)
263 .scalarize(0);
264
265 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
266 .legalFor({{s64, MinFPScalar}, {s64, s32}, {s64, s64}})
267 .libcallFor({{s64, s128}})
268 .minScalarOrElt(1, MinFPScalar);
269
270 getActionDefinitionsBuilder(
271 {G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2, G_FLOG10, G_FTAN, G_FEXP,
272 G_FEXP2, G_FEXP10, G_FACOS, G_FASIN, G_FATAN, G_FCOSH, G_FSINH, G_FTANH})
273 // We need a call for these, so we always need to scalarize.
274 .scalarize(0)
275 // Regardless of FP16 support, widen 16-bit elements to 32-bits.
276 .minScalar(0, s32)
277 .libcallFor({s32, s64});
278 getActionDefinitionsBuilder(G_FPOWI)
279 .scalarize(0)
280 .minScalar(0, s32)
281 .libcallFor({{s32, s32}, {s64, s32}});
282
283 getActionDefinitionsBuilder(G_INSERT)
284 .legalIf(all(typeInSet(0, {s32, s64, p0}),
285 typeInSet(1, {s8, s16, s32}), smallerThan(1, 0)))
286 .widenScalarToNextPow2(0)
287 .clampScalar(0, s32, s64)
288 .widenScalarToNextPow2(1)
289 .minScalar(1, s8)
290 .maxScalarIf(typeInSet(0, {s32}), 1, s16)
291 .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);
292
293 getActionDefinitionsBuilder(G_EXTRACT)
294 .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
295 typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
296 .widenScalarToNextPow2(1)
297 .clampScalar(1, s32, s128)
298 .widenScalarToNextPow2(0)
299 .minScalar(0, s16)
300 .maxScalarIf(typeInSet(1, {s32}), 0, s16)
301 .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
302 .maxScalarIf(typeInSet(1, {s128}), 0, s64);
303
304
305 for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
306 auto &Actions = getActionDefinitionsBuilder(Op);
307
308 if (Op == G_SEXTLOAD)
309 Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));
310
311 // Atomics have zero extending behavior.
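    // Each legalForTypesWithMemDesc entry is {result type, pointer type,
    // memory type, minimum alignment}; e.g. {s32, p0, s8, 8} is an extending
    // load of an 8-bit memory value into a 32-bit register.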
312 Actions
313 .legalForTypesWithMemDesc({{s32, p0, s8, 8},
314 {s32, p0, s16, 8},
315 {s32, p0, s32, 8},
316 {s64, p0, s8, 2},
317 {s64, p0, s16, 2},
318 {s64, p0, s32, 4},
319 {s64, p0, s64, 8},
320 {p0, p0, s64, 8},
321 {v2s32, p0, s64, 8}})
322 .widenScalarToNextPow2(0)
323 .clampScalar(0, s32, s64)
324 // TODO: We could support sum-of-pow2's but the lowering code doesn't know
325 // how to do that yet.
326 .unsupportedIfMemSizeNotPow2()
327 // Lower anything left over into G_*EXT and G_LOAD
328 .lower();
329 }
330
331 auto IsPtrVecPred = [=](const LegalityQuery &Query) {
332 const LLT &ValTy = Query.Types[0];
333 return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0;
334 };
335
336 auto &LoadActions = getActionDefinitionsBuilder(G_LOAD);
337 auto &StoreActions = getActionDefinitionsBuilder(G_STORE);
338
339 if (ST.hasSVE()) {
340 LoadActions.legalForTypesWithMemDesc({
341 // 128 bit base sizes
342 {nxv16s8, p0, nxv16s8, 8},
343 {nxv8s16, p0, nxv8s16, 8},
344 {nxv4s32, p0, nxv4s32, 8},
345 {nxv2s64, p0, nxv2s64, 8},
346 });
347
348 // TODO: Add nxv2p0. Consider bitcastIf.
349 // See #92130
350 // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
351 StoreActions.legalForTypesWithMemDesc({
352 // 128 bit base sizes
353 {nxv16s8, p0, nxv16s8, 8},
354 {nxv8s16, p0, nxv8s16, 8},
355 {nxv4s32, p0, nxv4s32, 8},
356 {nxv2s64, p0, nxv2s64, 8},
357 });
358 }
359
360 LoadActions
361 .customIf([=](const LegalityQuery &Query) {
362 return HasRCPC3 && Query.Types[0] == s128 &&
363 Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
364 })
365 .customIf([=](const LegalityQuery &Query) {
366 return Query.Types[0] == s128 &&
367 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
368 })
369 .legalForTypesWithMemDesc({{s8, p0, s8, 8},
370 {s16, p0, s16, 8},
371 {s32, p0, s32, 8},
372 {s64, p0, s64, 8},
373 {p0, p0, s64, 8},
374 {s128, p0, s128, 8},
375 {v8s8, p0, s64, 8},
376 {v16s8, p0, s128, 8},
377 {v4s16, p0, s64, 8},
378 {v8s16, p0, s128, 8},
379 {v2s32, p0, s64, 8},
380 {v4s32, p0, s128, 8},
381 {v2s64, p0, s128, 8}})
382 // These extends are also legal
383 .legalForTypesWithMemDesc(
384 {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
385 .widenScalarToNextPow2(0, /* MinSize = */ 8)
386 .clampMaxNumElements(0, s8, 16)
387 .clampMaxNumElements(0, s16, 8)
388 .clampMaxNumElements(0, s32, 4)
389 .clampMaxNumElements(0, s64, 2)
390 .clampMaxNumElements(0, p0, 2)
391 .lowerIfMemSizeNotByteSizePow2()
392 .clampScalar(0, s8, s64)
393 .narrowScalarIf(
394 [=](const LegalityQuery &Query) {
395 // Clamp extending load results to 32-bits.
396 return Query.Types[0].isScalar() &&
397 Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
398 Query.Types[0].getSizeInBits() > 32;
399 },
400 changeTo(0, s32))
401 // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
402 .bitcastIf(typeInSet(0, {v4s8}),
403 [=](const LegalityQuery &Query) {
404 const LLT VecTy = Query.Types[0];
405 return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
406 })
407 .customIf(IsPtrVecPred)
408 .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0);
409
410 StoreActions
411 .customIf([=](const LegalityQuery &Query) {
412 return HasRCPC3 && Query.Types[0] == s128 &&
413 Query.MMODescrs[0].Ordering == AtomicOrdering::Release;
414 })
415 .customIf([=](const LegalityQuery &Query) {
416 return Query.Types[0] == s128 &&
417 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
418 })
419 .legalForTypesWithMemDesc(
420 {{s8, p0, s8, 8}, {s16, p0, s8, 8}, // truncstorei8 from s16
421 {s32, p0, s8, 8}, // truncstorei8 from s32
422 {s64, p0, s8, 8}, // truncstorei8 from s64
423 {s16, p0, s16, 8}, {s32, p0, s16, 8}, // truncstorei16 from s32
424 {s64, p0, s16, 8}, // truncstorei16 from s64
425 {s32, p0, s8, 8}, {s32, p0, s16, 8}, {s32, p0, s32, 8},
426 {s64, p0, s64, 8}, {s64, p0, s32, 8}, // truncstorei32 from s64
427 {p0, p0, s64, 8}, {s128, p0, s128, 8}, {v16s8, p0, s128, 8},
428 {v8s8, p0, s64, 8}, {v4s16, p0, s64, 8}, {v8s16, p0, s128, 8},
429 {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
430 .clampScalar(0, s8, s64)
431 .lowerIf([=](const LegalityQuery &Query) {
432 return Query.Types[0].isScalar() &&
433 Query.Types[0] != Query.MMODescrs[0].MemoryTy;
434 })
435 // Maximum: sN * k = 128
436 .clampMaxNumElements(0, s8, 16)
437 .clampMaxNumElements(0, s16, 8)
438 .clampMaxNumElements(0, s32, 4)
439 .clampMaxNumElements(0, s64, 2)
440 .clampMaxNumElements(0, p0, 2)
441 .lowerIfMemSizeNotPow2()
442 // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
443 .bitcastIf(typeInSet(0, {v4s8}),
444 [=](const LegalityQuery &Query) {
445 const LLT VecTy = Query.Types[0];
446 return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
447 })
448 .customIf(IsPtrVecPred)
449 .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0);
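  // Per the bitcastIf rules above, v4s8 loads and stores are bitcast to a
  // scalar access of the same width (s32) rather than being scalarized.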
450
451 getActionDefinitionsBuilder(G_INDEXED_STORE)
452 // Idx 0 == Ptr, Idx 1 == Val
453 // TODO: we can implement legalizations but as of now these are
454 // generated in a very specific way.
455 .legalForTypesWithMemDesc({
456 {p0, s8, s8, 8},
457 {p0, s16, s16, 8},
458 {p0, s32, s8, 8},
459 {p0, s32, s16, 8},
460 {p0, s32, s32, 8},
461 {p0, s64, s64, 8},
462 {p0, p0, p0, 8},
463 {p0, v8s8, v8s8, 8},
464 {p0, v16s8, v16s8, 8},
465 {p0, v4s16, v4s16, 8},
466 {p0, v8s16, v8s16, 8},
467 {p0, v2s32, v2s32, 8},
468 {p0, v4s32, v4s32, 8},
469 {p0, v2s64, v2s64, 8},
470 {p0, v2p0, v2p0, 8},
471 {p0, s128, s128, 8},
472 })
473 .unsupported();
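  // The trailing unsupported() rejects any combination not explicitly listed
  // above rather than attempting to legalize it.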
474
475 auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {
476 LLT LdTy = Query.Types[0];
477 LLT PtrTy = Query.Types[1];
478 if (!llvm::is_contained(PackedVectorAllTypesVec, LdTy) &&
479 !llvm::is_contained(ScalarAndPtrTypesVec, LdTy) && LdTy != s128)
480 return false;
481 if (PtrTy != p0)
482 return false;
483 return true;
484 };
485 getActionDefinitionsBuilder(G_INDEXED_LOAD)
486 .unsupportedIf(
487 atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
488 .legalIf(IndexedLoadBasicPred)
489 .unsupported();
490 getActionDefinitionsBuilder({G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD})
491 .unsupportedIf(
492 atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
493 .legalIf(all(typeInSet(0, {s16, s32, s64}),
494 LegalityPredicate([=](const LegalityQuery &Q) {
495 LLT LdTy = Q.Types[0];
496 LLT PtrTy = Q.Types[1];
497 LLT MemTy = Q.MMODescrs[0].MemoryTy;
498 if (PtrTy != p0)
499 return false;
500 if (LdTy == s16)
501 return MemTy == s8;
502 if (LdTy == s32)
503 return MemTy == s8 || MemTy == s16;
504 if (LdTy == s64)
505 return MemTy == s8 || MemTy == s16 || MemTy == s32;
506 return false;
507 })))
508 .unsupported();
509
510 // Constants
511 getActionDefinitionsBuilder(G_CONSTANT)
512 .legalFor({p0, s8, s16, s32, s64})
513 .widenScalarToNextPow2(0)
514 .clampScalar(0, s8, s64);
515 getActionDefinitionsBuilder(G_FCONSTANT)
516 .legalIf([=](const LegalityQuery &Query) {
517 const auto &Ty = Query.Types[0];
518 if (HasFP16 && Ty == s16)
519 return true;
520 return Ty == s32 || Ty == s64 || Ty == s128;
521 })
522 .clampScalar(0, MinFPScalar, s128);
523
524 // FIXME: fix moreElementsToNextPow2
525 getActionDefinitionsBuilder(G_ICMP)
526 .legalFor({{s32, s32}, {s32, s64}, {s32, p0}})
527 .widenScalarOrEltToNextPow2(1)
528 .clampScalar(1, s32, s64)
529 .clampScalar(0, s32, s32)
530 .minScalarEltSameAsIf(
531 [=](const LegalityQuery &Query) {
532 const LLT &Ty = Query.Types[0];
533 const LLT &SrcTy = Query.Types[1];
534 return Ty.isVector() && !SrcTy.isPointerVector() &&
535 Ty.getElementType() != SrcTy.getElementType();
536 },
537 0, 1)
538 .minScalarOrEltIf(
539 [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
540 1, s32)
541 .minScalarOrEltIf(
542 [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
543 s64)
544 .moreElementsToNextPow2(1)
545 .clampNumElements(1, v8s8, v16s8)
546 .clampNumElements(1, v4s16, v8s16)
547 .clampNumElements(1, v2s32, v4s32)
548 .clampNumElements(1, v2s64, v2s64)
549 .customIf(isVector(0));
550
551 getActionDefinitionsBuilder(G_FCMP)
552 .legalFor({{s32, MinFPScalar},
553 {s32, s32},
554 {s32, s64},
555 {v4s32, v4s32},
556 {v2s32, v2s32},
557 {v2s64, v2s64}})
558 .legalIf([=](const LegalityQuery &Query) {
559 const auto &Ty = Query.Types[1];
560 return (Ty == v8s16 || Ty == v4s16) && Ty == Query.Types[0] && HasFP16;
561 })
562 .widenScalarOrEltToNextPow2(1)
563 .clampScalar(0, s32, s32)
564 .clampScalarOrElt(1, MinFPScalar, s64)
565 .minScalarEltSameAsIf(
566 [=](const LegalityQuery &Query) {
567 const LLT &Ty = Query.Types[0];
568 const LLT &SrcTy = Query.Types[1];
569 return Ty.isVector() && !SrcTy.isPointerVector() &&
570 Ty.getElementType() != SrcTy.getElementType();
571 },
572 0, 1)
573 .clampNumElements(1, v4s16, v8s16)
574 .clampNumElements(1, v2s32, v4s32)
575 .clampMaxNumElements(1, s64, 2)
576 .moreElementsToNextPow2(1);
577
578 // Extensions
579 auto ExtLegalFunc = [=](const LegalityQuery &Query) {
580 unsigned DstSize = Query.Types[0].getSizeInBits();
581
582 // Handle legal vectors using legalFor
583 if (Query.Types[0].isVector())
584 return false;
585
586 if (DstSize < 8 || DstSize >= 128 || !isPowerOf2_32(DstSize))
587 return false; // Extending to a scalar s128 needs narrowing.
588
589 const LLT &SrcTy = Query.Types[1];
590
591 // Make sure we fit in a register otherwise. Don't bother checking that
592 // the source type is below 128 bits. We shouldn't be allowing anything
593 // through which is wider than the destination in the first place.
594 unsigned SrcSize = SrcTy.getSizeInBits();
595 if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
596 return false;
597
598 return true;
599 };
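  // For scalars this accepts the usual power-of-two widenings (e.g. s8 -> s32,
  // s32 -> s64); extensions producing s128 are rejected here and narrowed by
  // the clampScalar below.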
600 getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
601 .legalIf(ExtLegalFunc)
602 .legalFor({{v2s64, v2s32}, {v4s32, v4s16}, {v8s16, v8s8}})
603 .clampScalar(0, s64, s64) // Just for s128, others are handled above.
604 .moreElementsToNextPow2(0)
605 .clampMaxNumElements(1, s8, 8)
606 .clampMaxNumElements(1, s16, 4)
607 .clampMaxNumElements(1, s32, 2)
608 // Tries to convert a large EXTEND into two smaller EXTENDs
609 .lowerIf([=](const LegalityQuery &Query) {
610 return (Query.Types[0].getScalarSizeInBits() >
611 Query.Types[1].getScalarSizeInBits() * 2) &&
612 Query.Types[0].isVector() &&
613 (Query.Types[1].getScalarSizeInBits() == 8 ||
614 Query.Types[1].getScalarSizeInBits() == 16);
615 })
616 .clampMinNumElements(1, s8, 8)
617 .clampMinNumElements(1, s16, 4);
618
619 getActionDefinitionsBuilder(G_TRUNC)
620 .legalFor({{v2s32, v2s64}, {v4s16, v4s32}, {v8s8, v8s16}})
621 .moreElementsToNextPow2(0)
622 .clampMaxNumElements(0, s8, 8)
623 .clampMaxNumElements(0, s16, 4)
624 .clampMaxNumElements(0, s32, 2)
625 .minScalarOrEltIf(
626 [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
627 0, s8)
628 .lowerIf([=](const LegalityQuery &Query) {
629 LLT DstTy = Query.Types[0];
630 LLT SrcTy = Query.Types[1];
631 return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
632 DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
633 })
634 .clampMinNumElements(0, s8, 8)
635 .clampMinNumElements(0, s16, 4)
636 .alwaysLegal();
637
638 getActionDefinitionsBuilder(G_SEXT_INREG)
639 .legalFor({s32, s64})
640 .legalFor(PackedVectorAllTypeList)
641 .maxScalar(0, s64)
642 .clampNumElements(0, v8s8, v16s8)
643 .clampNumElements(0, v4s16, v8s16)
644 .clampNumElements(0, v2s32, v4s32)
645 .clampMaxNumElements(0, s64, 2)
646 .lower();
647
648 // FP conversions
649 getActionDefinitionsBuilder(G_FPTRUNC)
650 .legalFor(
651 {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
652 .libcallFor({{s16, s128}, {s32, s128}, {s64, s128}})
653 .clampNumElements(0, v4s16, v4s16)
654 .clampNumElements(0, v2s32, v2s32)
655 .scalarize(0);
656
657 getActionDefinitionsBuilder(G_FPEXT)
658 .legalFor(
659 {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
660 .libcallFor({{s128, s64}, {s128, s32}, {s128, s16}})
661 .clampNumElements(0, v4s32, v4s32)
662 .clampNumElements(0, v2s64, v2s64)
663 .scalarize(0);
664
665 // Conversions
666 getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
667 .legalFor({{s32, s32},
668 {s64, s32},
669 {s32, s64},
670 {s64, s64},
671 {v2s64, v2s64},
672 {v4s32, v4s32},
673 {v2s32, v2s32}})
674 .legalIf([=](const LegalityQuery &Query) {
675 return HasFP16 &&
676 (Query.Types[1] == s16 || Query.Types[1] == v4s16 ||
677 Query.Types[1] == v8s16) &&
678 (Query.Types[0] == s32 || Query.Types[0] == s64 ||
679 Query.Types[0] == v4s16 || Query.Types[0] == v8s16);
680 })
681 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
682 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
683       // The range of an fp16 value fits into an i17, so it is safe to narrow
684       // the result width to i64.
685 .narrowScalarIf(
686 [=](const LegalityQuery &Query) {
687 return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64;
688 },
689 changeTo(0, s64))
690 .moreElementsToNextPow2(0)
691 .widenScalarOrEltToNextPow2OrMinSize(0)
692 .minScalar(0, s32)
693 .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
694 .widenScalarIf(
695 [=](const LegalityQuery &Query) {
696 return Query.Types[0].getScalarSizeInBits() <= 64 &&
697 Query.Types[0].getScalarSizeInBits() >
698 Query.Types[1].getScalarSizeInBits();
699 },
700 LegalizeMutations::changeElementSizeTo(1, 0))
701 .widenScalarIf(
702 [=](const LegalityQuery &Query) {
703 return Query.Types[1].getScalarSizeInBits() <= 64 &&
704 Query.Types[0].getScalarSizeInBits() <
705 Query.Types[1].getScalarSizeInBits();
706 },
707 LegalizeMutations::changeElementSizeTo(0, 1))
708 .clampNumElements(0, v4s16, v8s16)
709 .clampNumElements(0, v2s32, v4s32)
710 .clampMaxNumElements(0, s64, 2)
711 .libcallFor(
712 {{s32, s128}, {s64, s128}, {s128, s128}, {s128, s32}, {s128, s64}});
713
714 getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
715 .legalFor({{s32, s32},
716 {s64, s32},
717 {s32, s64},
718 {s64, s64},
719 {v2s64, v2s64},
720 {v4s32, v4s32},
721 {v2s32, v2s32}})
722 .legalIf([=](const LegalityQuery &Query) {
723 return HasFP16 &&
724 (Query.Types[0] == s16 || Query.Types[0] == v4s16 ||
725 Query.Types[0] == v8s16) &&
726 (Query.Types[1] == s32 || Query.Types[1] == s64 ||
727 Query.Types[1] == v4s16 || Query.Types[1] == v8s16);
728 })
729 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
730 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
731 .moreElementsToNextPow2(1)
732 .widenScalarOrEltToNextPow2OrMinSize(1)
733 .minScalar(1, s32)
734 .widenScalarOrEltToNextPow2OrMinSize(0, /*MinSize=*/HasFP16 ? 16 : 32)
735 .widenScalarIf(
736 [=](const LegalityQuery &Query) {
737 return Query.Types[1].getScalarSizeInBits() <= 64 &&
738 Query.Types[0].getScalarSizeInBits() <
739 Query.Types[1].getScalarSizeInBits();
740 },
741 LegalizeMutations::changeElementSizeTo(0, 1))
742 .widenScalarIf(
743 [=](const LegalityQuery &Query) {
744 return Query.Types[0].getScalarSizeInBits() <= 64 &&
745 Query.Types[0].getScalarSizeInBits() >
746 Query.Types[1].getScalarSizeInBits();
747 },
748 LegalizeMutations::changeElementSizeTo(1, 0))
749 .clampNumElements(0, v4s16, v8s16)
750 .clampNumElements(0, v2s32, v4s32)
751 .clampMaxNumElements(0, s64, 2)
752 .libcallFor({{s16, s128},
753 {s32, s128},
754 {s64, s128},
755 {s128, s128},
756 {s128, s32},
757 {s128, s64}});
758
759 // Control-flow
760 getActionDefinitionsBuilder(G_BRCOND)
761 .legalFor({s32})
762 .clampScalar(0, s32, s32);
763 getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
764
765 getActionDefinitionsBuilder(G_SELECT)
766 .legalFor({{s32, s32}, {s64, s32}, {p0, s32}})
767 .widenScalarToNextPow2(0)
768 .clampScalar(0, s32, s64)
769 .clampScalar(1, s32, s32)
770 .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
771 .lowerIf(isVector(0));
772
773 // Pointer-handling
774 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
775
776 if (TM.getCodeModel() == CodeModel::Small)
777 getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
778 else
779 getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
780
781 getActionDefinitionsBuilder(G_PTRAUTH_GLOBAL_VALUE)
782 .legalIf(all(typeIs(0, p0), typeIs(1, p0)));
783
784 getActionDefinitionsBuilder(G_PTRTOINT)
785 .legalFor({{s64, p0}, {v2s64, v2p0}})
786 .widenScalarToNextPow2(0, 64)
787 .clampScalar(0, s64, s64);
788
789 getActionDefinitionsBuilder(G_INTTOPTR)
790 .unsupportedIf([&](const LegalityQuery &Query) {
791 return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
792 })
793 .legalFor({{p0, s64}, {v2p0, v2s64}});
794
795 // Casts for 32 and 64-bit width type are just copies.
796 // Same for 128-bit width type, except they are on the FPR bank.
797 getActionDefinitionsBuilder(G_BITCAST)
798 // Keeping 32-bit instructions legal to prevent regression in some tests
799 .legalForCartesianProduct({s32, v2s16, v4s8})
800 .legalForCartesianProduct({s64, v8s8, v4s16, v2s32})
801 .legalForCartesianProduct({s128, v16s8, v8s16, v4s32, v2s64, v2p0})
802 .lowerIf([=](const LegalityQuery &Query) {
803 return Query.Types[0].isVector() != Query.Types[1].isVector();
804 })
805 .moreElementsToNextPow2(0)
806 .clampNumElements(0, v8s8, v16s8)
807 .clampNumElements(0, v4s16, v8s16)
808 .clampNumElements(0, v2s32, v4s32)
809 .lower();
810
811 getActionDefinitionsBuilder(G_VASTART).legalFor({p0});
812
813 // va_list must be a pointer, but most sized types are pretty easy to handle
814 // as the destination.
815 getActionDefinitionsBuilder(G_VAARG)
816 .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
817 .clampScalar(0, s8, s64)
818 .widenScalarToNextPow2(0, /*Min*/ 8);
819
820 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
821 .lowerIf(
822 all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
823
824 LegalityPredicate UseOutlineAtomics = [&ST](const LegalityQuery &Query) {
825 return ST.outlineAtomics() && !ST.hasLSE();
826 };
827
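  // With +outline-atomics but no LSE, compare-and-swap and RMW operations
  // become calls to the __aarch64_* outline-atomics helpers in the compiler
  // runtime.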
828 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
829 .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
830 predNot(UseOutlineAtomics)))
831 .customIf(all(typeIs(0, s128), predNot(UseOutlineAtomics)))
832 .customIf([UseOutlineAtomics](const LegalityQuery &Query) {
833 return Query.Types[0].getSizeInBits() == 128 &&
834 !UseOutlineAtomics(Query);
835 })
836 .libcallIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, p0),
837 UseOutlineAtomics))
838 .clampScalar(0, s32, s64);
839
840 getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
841 G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR,
842 G_ATOMICRMW_XOR})
843 .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
844 predNot(UseOutlineAtomics)))
845 .libcallIf(all(typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0),
846 UseOutlineAtomics))
847 .clampScalar(0, s32, s64);
848
849   // Do not outline these atomic operations, as per the comment in
850 // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR().
851 getActionDefinitionsBuilder(
852 {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
853 .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)))
854 .clampScalar(0, s32, s64);
855
856 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});
857
858 // Merge/Unmerge
859 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
860 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
861 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
862 getActionDefinitionsBuilder(Op)
863 .widenScalarToNextPow2(LitTyIdx, 8)
864 .widenScalarToNextPow2(BigTyIdx, 32)
865 .clampScalar(LitTyIdx, s8, s64)
866 .clampScalar(BigTyIdx, s32, s128)
867 .legalIf([=](const LegalityQuery &Q) {
868 switch (Q.Types[BigTyIdx].getSizeInBits()) {
869 case 32:
870 case 64:
871 case 128:
872 break;
873 default:
874 return false;
875 }
876 switch (Q.Types[LitTyIdx].getSizeInBits()) {
877 case 8:
878 case 16:
879 case 32:
880 case 64:
881 return true;
882 default:
883 return false;
884 }
885 });
886 }
887
888 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
889 .unsupportedIf([=](const LegalityQuery &Query) {
890 const LLT &EltTy = Query.Types[1].getElementType();
891 return Query.Types[0] != EltTy;
892 })
893 .minScalar(2, s64)
894 .customIf([=](const LegalityQuery &Query) {
895 const LLT &VecTy = Query.Types[1];
896 return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
897 VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
898 VecTy == v8s8 || VecTy == v16s8 || VecTy == v2p0;
899 })
900 .minScalarOrEltIf(
901 [=](const LegalityQuery &Query) {
902             // We want to promote <M x s1> to <M x s64> if that wouldn't
903 // cause the total vec size to be > 128b.
904 return Query.Types[1].getNumElements() <= 2;
905 },
906 0, s64)
907 .minScalarOrEltIf(
908 [=](const LegalityQuery &Query) {
909 return Query.Types[1].getNumElements() <= 4;
910 },
911 0, s32)
912 .minScalarOrEltIf(
913 [=](const LegalityQuery &Query) {
914 return Query.Types[1].getNumElements() <= 8;
915 },
916 0, s16)
917 .minScalarOrEltIf(
918 [=](const LegalityQuery &Query) {
919 return Query.Types[1].getNumElements() <= 16;
920 },
921 0, s8)
922 .minScalarOrElt(0, s8) // Worst case, we need at least s8.
923 .moreElementsToNextPow2(1)
924 .clampMaxNumElements(1, s64, 2)
925 .clampMaxNumElements(1, s32, 4)
926 .clampMaxNumElements(1, s16, 8)
927 .clampMaxNumElements(1, s8, 16)
928 .clampMaxNumElements(1, p0, 2);
929
930 getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
931 .legalIf(
932 typeInSet(0, {v16s8, v8s8, v8s16, v4s16, v4s32, v2s32, v2s64, v2p0}))
933 .moreElementsToNextPow2(0)
934 .widenVectorEltsToVectorMinSize(0, 64)
935 .clampNumElements(0, v8s8, v16s8)
936 .clampNumElements(0, v4s16, v8s16)
937 .clampNumElements(0, v2s32, v4s32)
938 .clampMaxNumElements(0, s64, 2)
939 .clampMaxNumElements(0, p0, 2);
940
941 getActionDefinitionsBuilder(G_BUILD_VECTOR)
942 .legalFor({{v8s8, s8},
943 {v16s8, s8},
944 {v4s16, s16},
945 {v8s16, s16},
946 {v2s32, s32},
947 {v4s32, s32},
948 {v2p0, p0},
949 {v2s64, s64}})
950 .clampNumElements(0, v4s32, v4s32)
951 .clampNumElements(0, v2s64, v2s64)
952 .minScalarOrElt(0, s8)
953 .widenVectorEltsToVectorMinSize(0, 64)
954 .minScalarSameAs(1, 0);
955
956 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
957
958 getActionDefinitionsBuilder(G_CTLZ)
959 .legalForCartesianProduct(
960 {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
961 .scalarize(1)
962 .widenScalarToNextPow2(1, /*Min=*/32)
963 .clampScalar(1, s32, s64)
964 .scalarSameSizeAs(0, 1);
965 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();
966
967 // TODO: Custom lowering for v2s32, v4s32, v2s64.
968 getActionDefinitionsBuilder(G_BITREVERSE)
969 .legalFor({s32, s64, v8s8, v16s8})
970 .widenScalarToNextPow2(0, /*Min = */ 32)
971 .clampScalar(0, s32, s64);
972
973 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();
974
975 getActionDefinitionsBuilder(G_CTTZ)
976 .lowerIf(isVector(0))
977 .widenScalarToNextPow2(1, /*Min=*/32)
978 .clampScalar(1, s32, s64)
979 .scalarSameSizeAs(0, 1)
980 .legalIf([=](const LegalityQuery &Query) {
981 return (HasCSSC && typeInSet(0, {s32, s64})(Query));
982 })
983 .customIf([=](const LegalityQuery &Query) {
984 return (!HasCSSC && typeInSet(0, {s32, s64})(Query));
985 });
986
987 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
988 .legalIf([=](const LegalityQuery &Query) {
989 const LLT &DstTy = Query.Types[0];
990 const LLT &SrcTy = Query.Types[1];
991 // For now just support the TBL2 variant which needs the source vectors
992 // to be the same size as the dest.
993 if (DstTy != SrcTy)
994 return false;
995 return llvm::is_contained(
996 {v2s64, v2p0, v2s32, v4s32, v4s16, v16s8, v8s8, v8s16}, DstTy);
997 })
998 // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we
999 // just want those lowered into G_BUILD_VECTOR
1000 .lowerIf([=](const LegalityQuery &Query) {
1001 return !Query.Types[1].isVector();
1002 })
1003 .moreElementsIf(
1004 [](const LegalityQuery &Query) {
1005 return Query.Types[0].isVector() && Query.Types[1].isVector() &&
1006 Query.Types[0].getNumElements() >
1007 Query.Types[1].getNumElements();
1008 },
1009 changeTo(1, 0))
1010 .moreElementsToNextPow2(0)
1011 .moreElementsIf(
1012 [](const LegalityQuery &Query) {
1013 return Query.Types[0].isVector() && Query.Types[1].isVector() &&
1014 Query.Types[0].getNumElements() <
1015 Query.Types[1].getNumElements();
1016 },
1017 changeTo(0, 1))
1018 .widenScalarOrEltToNextPow2OrMinSize(0, 8)
1019 .clampNumElements(0, v8s8, v16s8)
1020 .clampNumElements(0, v4s16, v8s16)
1021 .clampNumElements(0, v4s32, v4s32)
1022 .clampNumElements(0, v2s64, v2s64);
1023
1024 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1025 .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}})
1026 .bitcastIf(
1027 [=](const LegalityQuery &Query) {
1028 return Query.Types[0].getSizeInBits() <= 128 &&
1029 Query.Types[1].getSizeInBits() <= 64;
1030 },
1031 [=](const LegalityQuery &Query) {
1032 const LLT DstTy = Query.Types[0];
1033 const LLT SrcTy = Query.Types[1];
1034 return std::pair(
1035 0, DstTy.changeElementSize(SrcTy.getSizeInBits())
1036 .changeElementCount(
1037 DstTy.getElementCount().divideCoefficientBy(
1038 SrcTy.getNumElements())));
1039 });
1040
1041 getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({p0});
1042
1043 getActionDefinitionsBuilder(G_BRJT).legalFor({{p0, s64}});
1044
1045 getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom();
1046
1047 getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower();
1048
1049 if (ST.hasMOPS()) {
1050 // G_BZERO is not supported. Currently it is only emitted by
1051 // PreLegalizerCombiner for G_MEMSET with zero constant.
1052 getActionDefinitionsBuilder(G_BZERO).unsupported();
1053
1054 getActionDefinitionsBuilder(G_MEMSET)
1055 .legalForCartesianProduct({p0}, {s64}, {s64})
1056 .customForCartesianProduct({p0}, {s8}, {s64})
1057 .immIdx(0); // Inform verifier imm idx 0 is handled.
1058
1059 getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
1060 .legalForCartesianProduct({p0}, {p0}, {s64})
1061 .immIdx(0); // Inform verifier imm idx 0 is handled.
1062
1063 // G_MEMCPY_INLINE does not have a tailcall immediate
1064 getActionDefinitionsBuilder(G_MEMCPY_INLINE)
1065 .legalForCartesianProduct({p0}, {p0}, {s64});
1066
1067 } else {
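    // Without FEAT_MOPS (the memory copy/set instructions) these operations
    // are lowered to plain library calls.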
1068 getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
1069 .libcall();
1070 }
1071
1072 // FIXME: Legal vector types are only legal with NEON.
1073 auto &ABSActions = getActionDefinitionsBuilder(G_ABS);
1074 if (HasCSSC)
1075 ABSActions
1076 .legalFor({s32, s64});
1077 ABSActions.legalFor(PackedVectorAllTypeList)
1078 .customIf([=](const LegalityQuery &Q) {
1079 // TODO: Fix suboptimal codegen for 128+ bit types.
1080 LLT SrcTy = Q.Types[0];
1081 return SrcTy.isScalar() && SrcTy.getSizeInBits() < 128;
1082 })
1083 .widenScalarIf(
1084 [=](const LegalityQuery &Query) { return Query.Types[0] == v4s8; },
1085 [=](const LegalityQuery &Query) { return std::make_pair(0, v4s16); })
1086 .widenScalarIf(
1087 [=](const LegalityQuery &Query) { return Query.Types[0] == v2s16; },
1088 [=](const LegalityQuery &Query) { return std::make_pair(0, v2s32); })
1089 .clampNumElements(0, v8s8, v16s8)
1090 .clampNumElements(0, v4s16, v8s16)
1091 .clampNumElements(0, v2s32, v4s32)
1092 .clampNumElements(0, v2s64, v2s64)
1093 .moreElementsToNextPow2(0)
1094 .lower();
1095
1096 // For fadd reductions we have pairwise operations available. We treat the
1097 // usual legal types as legal and handle the lowering to pairwise instructions
1098 // later.
1099 getActionDefinitionsBuilder(G_VECREDUCE_FADD)
1100 .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
1101 .legalIf([=](const LegalityQuery &Query) {
1102 const auto &Ty = Query.Types[1];
1103 return (Ty == v4s16 || Ty == v8s16) && HasFP16;
1104 })
1105 .minScalarOrElt(0, MinFPScalar)
1106 .clampMaxNumElements(1, s64, 2)
1107 .clampMaxNumElements(1, s32, 4)
1108 .clampMaxNumElements(1, s16, 8)
1109 .lower();
1110
1111 // For fmul reductions we need to split up into individual operations. We
1112   // clamp to 128-bit vectors then to 64-bit vectors to produce a cascade of
1113 // smaller types, followed by scalarizing what remains.
1114 getActionDefinitionsBuilder(G_VECREDUCE_FMUL)
1115 .minScalarOrElt(0, MinFPScalar)
1116 .clampMaxNumElements(1, s64, 2)
1117 .clampMaxNumElements(1, s32, 4)
1118 .clampMaxNumElements(1, s16, 8)
1119 .clampMaxNumElements(1, s32, 2)
1120 .clampMaxNumElements(1, s16, 4)
1121 .scalarize(1)
1122 .lower();
1123
1124 getActionDefinitionsBuilder({G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL})
1125 .scalarize(2)
1126 .lower();
1127
1128 getActionDefinitionsBuilder(G_VECREDUCE_ADD)
1129 .legalFor({{s8, v16s8},
1130 {s8, v8s8},
1131 {s16, v8s16},
1132 {s16, v4s16},
1133 {s32, v4s32},
1134 {s32, v2s32},
1135 {s64, v2s64}})
1136 .clampMaxNumElements(1, s64, 2)
1137 .clampMaxNumElements(1, s32, 4)
1138 .clampMaxNumElements(1, s16, 8)
1139 .clampMaxNumElements(1, s8, 16)
1140 .lower();
1141
1142 getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX,
1143 G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM})
1144 .legalFor({{s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
1145 .legalIf([=](const LegalityQuery &Query) {
1146 const auto &Ty = Query.Types[1];
1147 return Query.Types[0] == s16 && (Ty == v8s16 || Ty == v4s16) && HasFP16;
1148 })
1149 .minScalarOrElt(0, MinFPScalar)
1150 .clampMaxNumElements(1, s64, 2)
1151 .clampMaxNumElements(1, s32, 4)
1152 .clampMaxNumElements(1, s16, 8)
1153 .lower();
1154
1155 getActionDefinitionsBuilder(G_VECREDUCE_MUL)
1156 .clampMaxNumElements(1, s32, 2)
1157 .clampMaxNumElements(1, s16, 4)
1158 .clampMaxNumElements(1, s8, 8)
1159 .scalarize(1)
1160 .lower();
1161
1162 getActionDefinitionsBuilder(
1163 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX})
1164 .legalFor({{s8, v8s8},
1165 {s8, v16s8},
1166 {s16, v4s16},
1167 {s16, v8s16},
1168 {s32, v2s32},
1169 {s32, v4s32}})
1170 .moreElementsIf(
1171 [=](const LegalityQuery &Query) {
1172 return Query.Types[1].isVector() &&
1173 Query.Types[1].getElementType() != s8 &&
1174 Query.Types[1].getNumElements() & 1;
1175 },
1176 LegalizeMutations::moreElementsToNextPow2(1))
1177 .clampMaxNumElements(1, s64, 2)
1178 .clampMaxNumElements(1, s32, 4)
1179 .clampMaxNumElements(1, s16, 8)
1180 .clampMaxNumElements(1, s8, 16)
1181 .scalarize(1)
1182 .lower();
1183
1184 getActionDefinitionsBuilder(
1185 {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
1186 // Try to break down into smaller vectors as long as they're at least 64
1187 // bits. This lets us use vector operations for some parts of the
1188 // reduction.
1189 .fewerElementsIf(
1190 [=](const LegalityQuery &Q) {
1191 LLT SrcTy = Q.Types[1];
1192 if (SrcTy.isScalar())
1193 return false;
1194 if (!isPowerOf2_32(SrcTy.getNumElements()))
1195 return false;
1196 // We can usually perform 64b vector operations.
1197 return SrcTy.getSizeInBits() > 64;
1198 },
1199 [=](const LegalityQuery &Q) {
1200 LLT SrcTy = Q.Types[1];
1201 return std::make_pair(1, SrcTy.divide(2));
1202 })
1203 .scalarize(1)
1204 .lower();
1205
1206 // TODO: Update this to correct handling when adding AArch64/SVE support.
1207 getActionDefinitionsBuilder(G_VECTOR_COMPRESS).lower();
1208
1209 getActionDefinitionsBuilder({G_FSHL, G_FSHR})
1210 .customFor({{s32, s32}, {s32, s64}, {s64, s64}})
1211 .lower();
1212
1213 getActionDefinitionsBuilder(G_ROTR)
1214 .legalFor({{s32, s64}, {s64, s64}})
1215 .customIf([=](const LegalityQuery &Q) {
1216 return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
1217 })
1218 .lower();
1219 getActionDefinitionsBuilder(G_ROTL).lower();
1220
1221 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1222 .customFor({{s32, s32}, {s64, s64}});
1223
1224 auto always = [=](const LegalityQuery &Q) { return true; };
1225 auto &CTPOPActions = getActionDefinitionsBuilder(G_CTPOP);
1226 if (HasCSSC)
1227 CTPOPActions
1228 .legalFor({{s32, s32},
1229 {s64, s64},
1230 {v8s8, v8s8},
1231 {v16s8, v16s8}})
1232 .customFor({{s128, s128},
1233 {v2s64, v2s64},
1234 {v2s32, v2s32},
1235 {v4s32, v4s32},
1236 {v4s16, v4s16},
1237 {v8s16, v8s16}});
1238 else
1239 CTPOPActions
1240 .legalFor({{v8s8, v8s8},
1241 {v16s8, v16s8}})
1242 .customFor({{s32, s32},
1243 {s64, s64},
1244 {s128, s128},
1245 {v2s64, v2s64},
1246 {v2s32, v2s32},
1247 {v4s32, v4s32},
1248 {v4s16, v4s16},
1249 {v8s16, v8s16}});
1250 CTPOPActions
1251 .clampScalar(0, s32, s128)
1252 .widenScalarToNextPow2(0)
1253 .minScalarEltSameAsIf(always, 1, 0)
1254 .maxScalarEltSameAsIf(always, 1, 0);
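  // Without CSSC, scalar G_CTPOP is marked custom above; legalizeCTPOP
  // (dispatched from legalizeCustom below) lowers it, roughly, via the NEON
  // CNT instruction followed by an add-across-vector reduction.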
1255
1256 getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
1257 .legalFor({v2s64, v2s32, v4s32, v4s16, v8s16, v8s8, v16s8})
1258 .clampNumElements(0, v8s8, v16s8)
1259 .clampNumElements(0, v4s16, v8s16)
1260 .clampNumElements(0, v2s32, v4s32)
1261 .clampMaxNumElements(0, s64, 2)
1262 .moreElementsToNextPow2(0)
1263 .lower();
1264
1265 // TODO: Libcall support for s128.
1266 // TODO: s16 should be legal with full FP16 support.
1267 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1268 .legalFor({{s64, s32}, {s64, s64}});
1269
1270 // TODO: Custom legalization for mismatched types.
1271 getActionDefinitionsBuilder(G_FCOPYSIGN)
1272 .moreElementsIf(
1273 [](const LegalityQuery &Query) { return Query.Types[0].isScalar(); },
1274 [=](const LegalityQuery &Query) {
1275 const LLT Ty = Query.Types[0];
1276 return std::pair(0, LLT::fixed_vector(Ty == s16 ? 4 : 2, Ty));
1277 })
1278 .lower();
1279
1280 getActionDefinitionsBuilder(G_FMAD).lower();
1281
1282 // Access to floating-point environment.
1283 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV,
1284 G_GET_FPMODE, G_SET_FPMODE, G_RESET_FPMODE})
1285 .libcall();
1286
1287 getActionDefinitionsBuilder(G_IS_FPCLASS).lower();
1288
1289 getActionDefinitionsBuilder(G_PREFETCH).custom();
1290
1291 getActionDefinitionsBuilder({G_SCMP, G_UCMP}).lower();
1292
1293 getLegacyLegalizerInfo().computeTables();
1294 verify(*ST.getInstrInfo());
1295 }
1296
1297 bool AArch64LegalizerInfo::legalizeCustom(
1298 LegalizerHelper &Helper, MachineInstr &MI,
1299 LostDebugLocObserver &LocObserver) const {
1300 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1301 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1302 GISelChangeObserver &Observer = Helper.Observer;
1303 switch (MI.getOpcode()) {
1304 default:
1305 // No idea what to do.
1306 return false;
1307 case TargetOpcode::G_VAARG:
1308 return legalizeVaArg(MI, MRI, MIRBuilder);
1309 case TargetOpcode::G_LOAD:
1310 case TargetOpcode::G_STORE:
1311 return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
1312 case TargetOpcode::G_SHL:
1313 case TargetOpcode::G_ASHR:
1314 case TargetOpcode::G_LSHR:
1315 return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
1316 case TargetOpcode::G_GLOBAL_VALUE:
1317 return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
1318 case TargetOpcode::G_SBFX:
1319 case TargetOpcode::G_UBFX:
1320 return legalizeBitfieldExtract(MI, MRI, Helper);
1321 case TargetOpcode::G_FSHL:
1322 case TargetOpcode::G_FSHR:
1323 return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper);
1324 case TargetOpcode::G_ROTR:
1325 return legalizeRotate(MI, MRI, Helper);
1326 case TargetOpcode::G_CTPOP:
1327 return legalizeCTPOP(MI, MRI, Helper);
1328 case TargetOpcode::G_ATOMIC_CMPXCHG:
1329 return legalizeAtomicCmpxchg128(MI, MRI, Helper);
1330 case TargetOpcode::G_CTTZ:
1331 return legalizeCTTZ(MI, Helper);
1332 case TargetOpcode::G_BZERO:
1333 case TargetOpcode::G_MEMCPY:
1334 case TargetOpcode::G_MEMMOVE:
1335 case TargetOpcode::G_MEMSET:
1336 return legalizeMemOps(MI, Helper);
1337 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1338 return legalizeExtractVectorElt(MI, MRI, Helper);
1339 case TargetOpcode::G_DYN_STACKALLOC:
1340 return legalizeDynStackAlloc(MI, Helper);
1341 case TargetOpcode::G_PREFETCH:
1342 return legalizePrefetch(MI, Helper);
1343 case TargetOpcode::G_ABS:
1344 return Helper.lowerAbsToCNeg(MI);
1345 case TargetOpcode::G_ICMP:
1346 return legalizeICMP(MI, MRI, MIRBuilder);
1347 }
1348
1349 llvm_unreachable("expected switch to return");
1350 }
1351
1352 bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
1353 MachineRegisterInfo &MRI,
1354 MachineIRBuilder &MIRBuilder,
1355 GISelChangeObserver &Observer,
1356 LegalizerHelper &Helper) const {
1357 assert(MI.getOpcode() == TargetOpcode::G_FSHL ||
1358 MI.getOpcode() == TargetOpcode::G_FSHR);
1359
1360 // Keep as G_FSHR if shift amount is a G_CONSTANT, else use generic
1361 // lowering
1362 Register ShiftNo = MI.getOperand(3).getReg();
1363 LLT ShiftTy = MRI.getType(ShiftNo);
1364 auto VRegAndVal = getIConstantVRegValWithLookThrough(ShiftNo, MRI);
1365
1366 // Adjust shift amount according to Opcode (FSHL/FSHR)
1367 // Convert FSHL to FSHR
1368 LLT OperationTy = MRI.getType(MI.getOperand(0).getReg());
1369 APInt BitWidth(ShiftTy.getSizeInBits(), OperationTy.getSizeInBits(), false);
1370
1371 // Lower non-constant shifts and leave zero shifts to the optimizer.
1372 if (!VRegAndVal || VRegAndVal->Value.urem(BitWidth) == 0)
1373 return (Helper.lowerFunnelShiftAsShifts(MI) ==
1374 LegalizerHelper::LegalizeResult::Legalized);
1375
1376 APInt Amount = VRegAndVal->Value.urem(BitWidth);
1377
1378 Amount = MI.getOpcode() == TargetOpcode::G_FSHL ? BitWidth - Amount : Amount;
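  // This relies on the identity fshl(x, y, c) == fshr(x, y, BitWidth - c) for
  // a non-zero constant c, so only the G_FSHR form needs to be selected.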
1379
1380   // If the instruction is a G_FSHR whose shift amount is a 64-bit G_CONSTANT
1381   // in the range [0, BitWidth), it is already legal.
1382 if (ShiftTy.getSizeInBits() == 64 && MI.getOpcode() == TargetOpcode::G_FSHR &&
1383 VRegAndVal->Value.ult(BitWidth))
1384 return true;
1385
1386   // Materialize the adjusted shift amount as a 64-bit constant.
1387 auto Cast64 = MIRBuilder.buildConstant(LLT::scalar(64), Amount.zext(64));
1388
1389 if (MI.getOpcode() == TargetOpcode::G_FSHR) {
1390 Observer.changingInstr(MI);
1391 MI.getOperand(3).setReg(Cast64.getReg(0));
1392 Observer.changedInstr(MI);
1393 }
1394 // If Opcode is FSHL, remove the FSHL instruction and create a FSHR
1395 // instruction
1396 else if (MI.getOpcode() == TargetOpcode::G_FSHL) {
1397 MIRBuilder.buildInstr(TargetOpcode::G_FSHR, {MI.getOperand(0).getReg()},
1398 {MI.getOperand(1).getReg(), MI.getOperand(2).getReg(),
1399 Cast64.getReg(0)});
1400 MI.eraseFromParent();
1401 }
1402 return true;
1403 }
1404
1405 bool AArch64LegalizerInfo::legalizeICMP(MachineInstr &MI,
1406 MachineRegisterInfo &MRI,
1407 MachineIRBuilder &MIRBuilder) const {
1408 Register DstReg = MI.getOperand(0).getReg();
1409 Register SrcReg1 = MI.getOperand(2).getReg();
1410 Register SrcReg2 = MI.getOperand(3).getReg();
1411 LLT DstTy = MRI.getType(DstReg);
1412 LLT SrcTy = MRI.getType(SrcReg1);
1413
1414 // Check the vector types are legal
1415 if (DstTy.getScalarSizeInBits() != SrcTy.getScalarSizeInBits() ||
1416 DstTy.getNumElements() != SrcTy.getNumElements() ||
1417 (DstTy.getSizeInBits() != 64 && DstTy.getSizeInBits() != 128))
1418 return false;
1419
1420   // Lower G_ICMP NE to G_ICMP EQ followed by a NOT to allow better pattern
1421   // matching in later passes.
1422 CmpInst::Predicate Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
1423 if (Pred != CmpInst::ICMP_NE)
1424 return true;
1425 Register CmpReg =
1426 MIRBuilder
1427 .buildICmp(CmpInst::ICMP_EQ, MRI.getType(DstReg), SrcReg1, SrcReg2)
1428 .getReg(0);
1429 MIRBuilder.buildNot(DstReg, CmpReg);
1430
1431 MI.eraseFromParent();
1432 return true;
1433 }
1434
1435 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
1436 MachineRegisterInfo &MRI,
1437 LegalizerHelper &Helper) const {
1438 // To allow for imported patterns to match, we ensure that the rotate amount
1439 // is 64b with an extension.
1440 Register AmtReg = MI.getOperand(2).getReg();
1441 LLT AmtTy = MRI.getType(AmtReg);
1442 (void)AmtTy;
1443 assert(AmtTy.isScalar() && "Expected a scalar rotate");
1444 assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
1445 auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
1446 Helper.Observer.changingInstr(MI);
1447 MI.getOperand(2).setReg(NewAmt.getReg(0));
1448 Helper.Observer.changedInstr(MI);
1449 return true;
1450 }
1451
1452 bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
1453 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1454 GISelChangeObserver &Observer) const {
1455 assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
1456 // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
1457 // G_ADD_LOW instructions.
1458 // By splitting this here, we can optimize accesses in the small code model by
1459   // folding the G_ADD_LOW into the load/store offset.
1460 auto &GlobalOp = MI.getOperand(1);
1461 // Don't modify an intrinsic call.
1462 if (GlobalOp.isSymbol())
1463 return true;
1464 const auto* GV = GlobalOp.getGlobal();
1465 if (GV->isThreadLocal())
1466 return true; // Don't want to modify TLS vars.
1467
1468 auto &TM = ST->getTargetLowering()->getTargetMachine();
1469 unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);
1470
1471 if (OpFlags & AArch64II::MO_GOT)
1472 return true;
1473
1474 auto Offset = GlobalOp.getOffset();
1475 Register DstReg = MI.getOperand(0).getReg();
1476 auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
1477 .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
1478 // Set the regclass on the dest reg too.
1479 MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1480
1481 // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
1482 // by creating a MOVK that sets bits 48-63 of the register to (global address
1483 // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
1484 // prevent an incorrect tag being generated during relocation when the
1485 // global appears before the code section. Without the offset, a global at
1486 // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
1487 // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
1488 // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
1489 // instead of `0xf`.
1490 // This assumes that we're in the small code model so we can assume a binary
1491 // size of <= 4GB, which makes the untagged PC relative offset positive. The
1492 // binary must also be loaded into address range [0, 2^48). Both of these
1493 // properties need to be ensured at runtime when using tagged addresses.
1494 if (OpFlags & AArch64II::MO_TAGGED) {
1495 assert(!Offset &&
1496 "Should not have folded in an offset for a tagged global!");
1497 ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
1498 .addGlobalAddress(GV, 0x100000000,
1499 AArch64II::MO_PREL | AArch64II::MO_G3)
1500 .addImm(48);
1501 MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1502 }
1503
1504 MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
1505 .addGlobalAddress(GV, Offset,
1506 OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
1507 MI.eraseFromParent();
1508 return true;
1509 }
1510
1511 bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
1512 MachineInstr &MI) const {
1513 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
1514 switch (IntrinsicID) {
1515 case Intrinsic::vacopy: {
1516 unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
1517 unsigned VaListSize =
1518 (ST->isTargetDarwin() || ST->isTargetWindows())
1519 ? PtrSize
1520 : ST->isTargetILP32() ? 20 : 32;
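    // Darwin and Windows use a single pointer for va_list; the AAPCS64
    // va_list is a 32-byte structure (20 bytes under ILP32), so copy that
    // many bytes.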
1521
1522 MachineFunction &MF = *MI.getMF();
1523 auto Val = MF.getRegInfo().createGenericVirtualRegister(
1524 LLT::scalar(VaListSize * 8));
1525 MachineIRBuilder MIB(MI);
1526 MIB.buildLoad(Val, MI.getOperand(2),
1527 *MF.getMachineMemOperand(MachinePointerInfo(),
1528 MachineMemOperand::MOLoad,
1529 VaListSize, Align(PtrSize)));
1530 MIB.buildStore(Val, MI.getOperand(1),
1531 *MF.getMachineMemOperand(MachinePointerInfo(),
1532 MachineMemOperand::MOStore,
1533 VaListSize, Align(PtrSize)));
1534 MI.eraseFromParent();
1535 return true;
1536 }
1537 case Intrinsic::get_dynamic_area_offset: {
1538 MachineIRBuilder &MIB = Helper.MIRBuilder;
1539 MIB.buildConstant(MI.getOperand(0).getReg(), 0);
1540 MI.eraseFromParent();
1541 return true;
1542 }
1543 case Intrinsic::aarch64_mops_memset_tag: {
1544 assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1545 // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
1546 // the instruction).
1547 MachineIRBuilder MIB(MI);
1548 auto &Value = MI.getOperand(3);
1549 Register ExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
1550 Value.setReg(ExtValueReg);
1551 return true;
1552 }
1553 case Intrinsic::aarch64_prefetch: {
1554 MachineIRBuilder MIB(MI);
1555 auto &AddrVal = MI.getOperand(1);
1556
1557 int64_t IsWrite = MI.getOperand(2).getImm();
1558 int64_t Target = MI.getOperand(3).getImm();
1559 int64_t IsStream = MI.getOperand(4).getImm();
1560 int64_t IsData = MI.getOperand(5).getImm();
1561
1562 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
1563 (!IsData << 3) | // IsDataCache bit
1564 (Target << 1) | // Cache level bits
1565 (unsigned)IsStream; // Stream bit
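    // For example, IsWrite=0, Target=0, IsStream=0, IsData=1 yields
    // PrfOp == 0, which corresponds to PLDL1KEEP.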
1566
1567 MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
1568 MI.eraseFromParent();
1569 return true;
1570 }
1571 case Intrinsic::aarch64_neon_uaddv:
1572 case Intrinsic::aarch64_neon_saddv:
1573 case Intrinsic::aarch64_neon_umaxv:
1574 case Intrinsic::aarch64_neon_smaxv:
1575 case Intrinsic::aarch64_neon_uminv:
1576 case Intrinsic::aarch64_neon_sminv: {
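    // These reductions are modeled as returning the vector element type; if
    // the IR destination is wider (e.g. an i32 result for a byte vector),
    // produce the reduction in the element type and then sign- or zero-extend
    // it back to the original destination below.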
1577 MachineIRBuilder MIB(MI);
1578 MachineRegisterInfo &MRI = *MIB.getMRI();
1579 bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv ||
1580 IntrinsicID == Intrinsic::aarch64_neon_smaxv ||
1581 IntrinsicID == Intrinsic::aarch64_neon_sminv;
1582
1583 auto OldDst = MI.getOperand(0).getReg();
1584 auto OldDstTy = MRI.getType(OldDst);
1585 LLT NewDstTy = MRI.getType(MI.getOperand(2).getReg()).getElementType();
1586 if (OldDstTy == NewDstTy)
1587 return true;
1588
1589 auto NewDst = MRI.createGenericVirtualRegister(NewDstTy);
1590
1591 Helper.Observer.changingInstr(MI);
1592 MI.getOperand(0).setReg(NewDst);
1593 Helper.Observer.changedInstr(MI);
1594
1595 MIB.setInsertPt(MIB.getMBB(), ++MIB.getInsertPt());
1596 MIB.buildExtOrTrunc(IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT,
1597 OldDst, NewDst);
1598
1599 return true;
1600 }
1601 case Intrinsic::aarch64_neon_uaddlp:
1602 case Intrinsic::aarch64_neon_saddlp: {
1603 MachineIRBuilder MIB(MI);
1604
1605 unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
1606 ? AArch64::G_UADDLP
1607 : AArch64::G_SADDLP;
1608 MIB.buildInstr(Opc, {MI.getOperand(0)}, {MI.getOperand(2)});
1609 MI.eraseFromParent();
1610
1611 return true;
1612 }
1613 case Intrinsic::aarch64_neon_uaddlv:
1614 case Intrinsic::aarch64_neon_saddlv: {
1615 MachineIRBuilder MIB(MI);
1616 MachineRegisterInfo &MRI = *MIB.getMRI();
1617
1618 unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
1619 ? AArch64::G_UADDLV
1620 : AArch64::G_SADDLV;
1621 Register DstReg = MI.getOperand(0).getReg();
1622 Register SrcReg = MI.getOperand(2).getReg();
1623 LLT DstTy = MRI.getType(DstReg);
1624
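    // G_UADDLV/G_SADDLV produce their scalar sum in a SIMD register; model it
    // as a v4s32 (results up to 32 bits) or v2s64 value, extract lane 0, and
    // then truncate or copy into the requested destination type.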
1625 LLT MidTy, ExtTy;
1626 if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) {
1627 MidTy = LLT::fixed_vector(4, 32);
1628 ExtTy = LLT::scalar(32);
1629 } else {
1630 MidTy = LLT::fixed_vector(2, 64);
1631 ExtTy = LLT::scalar(64);
1632 }
1633
1634 Register MidReg =
1635 MIB.buildInstr(Opc, {MidTy}, {SrcReg})->getOperand(0).getReg();
1636 Register ZeroReg =
1637 MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
1638 Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy},
1639 {MidReg, ZeroReg})
1640 .getReg(0);
1641
1642 if (DstTy.getScalarSizeInBits() < 32)
1643 MIB.buildTrunc(DstReg, ExtReg);
1644 else
1645 MIB.buildCopy(DstReg, ExtReg);
1646
1647 MI.eraseFromParent();
1648
1649 return true;
1650 }
1651 case Intrinsic::aarch64_neon_smax:
1652 case Intrinsic::aarch64_neon_smin:
1653 case Intrinsic::aarch64_neon_umax:
1654 case Intrinsic::aarch64_neon_umin:
1655 case Intrinsic::aarch64_neon_fmax:
1656 case Intrinsic::aarch64_neon_fmin:
1657 case Intrinsic::aarch64_neon_fmaxnm:
1658 case Intrinsic::aarch64_neon_fminnm: {
1659 MachineIRBuilder MIB(MI);
1660 if (IntrinsicID == Intrinsic::aarch64_neon_smax)
1661 MIB.buildSMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
1662 else if (IntrinsicID == Intrinsic::aarch64_neon_smin)
1663 MIB.buildSMin(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
1664 else if (IntrinsicID == Intrinsic::aarch64_neon_umax)
1665 MIB.buildUMax(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
1666 else if (IntrinsicID == Intrinsic::aarch64_neon_umin)
1667 MIB.buildUMin(MI.getOperand(0), MI.getOperand(2), MI.getOperand(3));
1668 else if (IntrinsicID == Intrinsic::aarch64_neon_fmax)
1669 MIB.buildInstr(TargetOpcode::G_FMAXIMUM, {MI.getOperand(0)},
1670 {MI.getOperand(2), MI.getOperand(3)});
1671 else if (IntrinsicID == Intrinsic::aarch64_neon_fmin)
1672 MIB.buildInstr(TargetOpcode::G_FMINIMUM, {MI.getOperand(0)},
1673 {MI.getOperand(2), MI.getOperand(3)});
1674 else if (IntrinsicID == Intrinsic::aarch64_neon_fmaxnm)
1675 MIB.buildInstr(TargetOpcode::G_FMAXNUM, {MI.getOperand(0)},
1676 {MI.getOperand(2), MI.getOperand(3)});
1677 else if (IntrinsicID == Intrinsic::aarch64_neon_fminnm)
1678 MIB.buildInstr(TargetOpcode::G_FMINNUM, {MI.getOperand(0)},
1679 {MI.getOperand(2), MI.getOperand(3)});
1680 MI.eraseFromParent();
1681 return true;
1682 }
1683 case Intrinsic::vector_reverse:
1684 // TODO: Add support for vector_reverse
1685 return false;
1686 }
1687
1688 return true;
1689 }
1690
1691 bool AArch64LegalizerInfo::legalizeShlAshrLshr(
1692 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1693 GISelChangeObserver &Observer) const {
1694 assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
1695 MI.getOpcode() == TargetOpcode::G_LSHR ||
1696 MI.getOpcode() == TargetOpcode::G_SHL);
1697 // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
1698 // imported patterns can select it later. Either way, it will be legal.
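  // For example (hypothetical MIR), a shift amount defined by
  //   %amt:_(s32) = G_CONSTANT i32 3
  // is rebuilt below as a 64-bit G_CONSTANT of 3.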
1699 Register AmtReg = MI.getOperand(2).getReg();
1700 auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
1701 if (!VRegAndVal)
1702 return true;
1703 // Check the shift amount is in range for an immediate form.
1704 int64_t Amount = VRegAndVal->Value.getSExtValue();
1705 if (Amount > 31)
1706 return true; // This will have to remain a register variant.
1707 auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
1708 Observer.changingInstr(MI);
1709 MI.getOperand(2).setReg(ExtCst.getReg(0));
1710 Observer.changedInstr(MI);
1711 return true;
1712 }
1713
1714 static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
1715 MachineRegisterInfo &MRI) {
1716 Base = Root;
1717 Offset = 0;
1718
1719 Register NewBase;
1720 int64_t NewOffset;
1721 if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
1722 isShiftedInt<7, 3>(NewOffset)) {
1723 Base = NewBase;
1724 Offset = NewOffset;
1725 }
1726 }
1727
1728 // FIXME: This should be removed and replaced with the generic bitcast legalize
1729 // action.
1730 bool AArch64LegalizerInfo::legalizeLoadStore(
1731 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1732 GISelChangeObserver &Observer) const {
1733 assert(MI.getOpcode() == TargetOpcode::G_STORE ||
1734 MI.getOpcode() == TargetOpcode::G_LOAD);
1735   // Here we handle vector loads/stores whose value type has pointer
1736   // elements, which the imported SelectionDAG patterns can't handle. To let
1737   // the existing s64 patterns fire for p0, we bitcast the value to use s64
1738   // element types.
1739
1740   // Custom legalization requires that the instruction, if not deleted, be
1741   // fully legalized. To allow further legalization of the instruction, we
1742   // create a new instruction and erase the existing one.
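  // For example (hypothetical MIR), a store of a pointer vector
  //   G_STORE %val(<2 x p0>), %addr(p0)
  // is rewritten as
  //   %cast:_(<2 x s64>) = G_BITCAST %val(<2 x p0>)
  //   G_STORE %cast(<2 x s64>), %addr(p0)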
1743
1744 Register ValReg = MI.getOperand(0).getReg();
1745 const LLT ValTy = MRI.getType(ValReg);
1746
1747 if (ValTy == LLT::scalar(128)) {
1748
1749 AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
1750 bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
1751 bool IsLoadAcquire = IsLoad && Ordering == AtomicOrdering::Acquire;
1752 bool IsStoreRelease = !IsLoad && Ordering == AtomicOrdering::Release;
1753 bool IsRcpC3 =
1754 ST->hasLSE2() && ST->hasRCPC3() && (IsLoadAcquire || IsStoreRelease);
1755
1756 LLT s64 = LLT::scalar(64);
1757
1758 unsigned Opcode;
1759 if (IsRcpC3) {
1760 Opcode = IsLoad ? AArch64::LDIAPPX : AArch64::STILPX;
1761 } else {
1762 // For LSE2, loads/stores should have been converted to monotonic and had
1763 // a fence inserted after them.
1764 assert(Ordering == AtomicOrdering::Monotonic ||
1765 Ordering == AtomicOrdering::Unordered);
1766 assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
1767
1768 Opcode = IsLoad ? AArch64::LDPXi : AArch64::STPXi;
1769 }
1770
1771 MachineInstrBuilder NewI;
1772 if (IsLoad) {
1773 NewI = MIRBuilder.buildInstr(Opcode, {s64, s64}, {});
1774 MIRBuilder.buildMergeLikeInstr(
1775 ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
1776 } else {
1777 auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
1778 NewI = MIRBuilder.buildInstr(
1779 Opcode, {}, {Split->getOperand(0), Split->getOperand(1)});
1780 }
1781
1782 if (IsRcpC3) {
1783 NewI.addUse(MI.getOperand(1).getReg());
1784 } else {
1785 Register Base;
1786 int Offset;
1787 matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
1788 NewI.addUse(Base);
1789 NewI.addImm(Offset / 8);
1790 }
1791
1792 NewI.cloneMemRefs(MI);
1793 constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
1794 *MRI.getTargetRegisterInfo(),
1795 *ST->getRegBankInfo());
1796 MI.eraseFromParent();
1797 return true;
1798 }
1799
1800 if (!ValTy.isPointerVector() ||
1801 ValTy.getElementType().getAddressSpace() != 0) {
1802 LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
1803 return false;
1804 }
1805
1806 unsigned PtrSize = ValTy.getElementType().getSizeInBits();
1807 const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
1808 auto &MMO = **MI.memoperands_begin();
1809 MMO.setType(NewTy);
1810
1811 if (MI.getOpcode() == TargetOpcode::G_STORE) {
1812 auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
1813 MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
1814 } else {
1815 auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
1816 MIRBuilder.buildBitcast(ValReg, NewLoad);
1817 }
1818 MI.eraseFromParent();
1819 return true;
1820 }
1821
1822 bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
1823 MachineRegisterInfo &MRI,
1824 MachineIRBuilder &MIRBuilder) const {
1825 MachineFunction &MF = MIRBuilder.getMF();
1826 Align Alignment(MI.getOperand(2).getImm());
1827 Register Dst = MI.getOperand(0).getReg();
1828 Register ListPtr = MI.getOperand(1).getReg();
1829
1830 LLT PtrTy = MRI.getType(ListPtr);
1831 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1832
1833 const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
1834 const Align PtrAlign = Align(PtrSize);
1835 auto List = MIRBuilder.buildLoad(
1836 PtrTy, ListPtr,
1837 *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1838 PtrTy, PtrAlign));
1839
1840 MachineInstrBuilder DstPtr;
1841 if (Alignment > PtrAlign) {
1842 // Realign the list to the actual required alignment.
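    // For example, with a 16-byte alignment the two instructions below
    // compute (List + 15) & ~15.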
1843 auto AlignMinus1 =
1844 MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
1845 auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
1846 DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
1847 } else
1848 DstPtr = List;
1849
1850 LLT ValTy = MRI.getType(Dst);
1851 uint64_t ValSize = ValTy.getSizeInBits() / 8;
1852 MIRBuilder.buildLoad(
1853 Dst, DstPtr,
1854 *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1855 ValTy, std::max(Alignment, PtrAlign)));
1856
1857 auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));
1858
1859 auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));
1860
1861 MIRBuilder.buildStore(NewList, ListPtr,
1862 *MF.getMachineMemOperand(MachinePointerInfo(),
1863 MachineMemOperand::MOStore,
1864 PtrTy, PtrAlign));
1865
1866 MI.eraseFromParent();
1867 return true;
1868 }
1869
1870 bool AArch64LegalizerInfo::legalizeBitfieldExtract(
1871 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
1872 // Only legal if we can select immediate forms.
1873 // TODO: Lower this otherwise.
1874 return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
1875 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
1876 }
1877
1878 bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
1879 MachineRegisterInfo &MRI,
1880 LegalizerHelper &Helper) const {
1881 // When there is no integer popcount instruction (FEAT_CSSC isn't available),
1882 // it can be more efficiently lowered to the following sequence that uses
1883 // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
1884 // registers are cheap.
1885 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
1886 // CNT V0.8B, V0.8B // 8xbyte pop-counts
1887 // ADDV B0, V0.8B // sum 8xbyte pop-counts
1888 // UMOV X0, V0.B[0] // copy byte result back to integer reg
1889 //
1890 // For 128 bit vector popcounts, we lower to the following sequence:
1891 // cnt.16b v0, v0 // v8s16, v4s32, v2s64
1892 // uaddlp.8h v0, v0 // v8s16, v4s32, v2s64
1893 // uaddlp.4s v0, v0 // v4s32, v2s64
1894 // uaddlp.2d v0, v0 // v2s64
1895 //
1896 // For 64 bit vector popcounts, we lower to the following sequence:
1897 // cnt.8b v0, v0 // v4s16, v2s32
1898 // uaddlp.4h v0, v0 // v4s16, v2s32
1899 // uaddlp.2s v0, v0 // v2s32
1900
1901 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1902 Register Dst = MI.getOperand(0).getReg();
1903 Register Val = MI.getOperand(1).getReg();
1904 LLT Ty = MRI.getType(Val);
1905 unsigned Size = Ty.getSizeInBits();
1906
1907 assert(Ty == MRI.getType(Dst) &&
1908 "Expected src and dst to have the same type!");
1909
1910 if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
1911 LLT s64 = LLT::scalar(64);
1912
1913 auto Split = MIRBuilder.buildUnmerge(s64, Val);
1914 auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0));
1915 auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1));
1916 auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2);
1917
1918 MIRBuilder.buildZExt(Dst, Add);
1919 MI.eraseFromParent();
1920 return true;
1921 }
1922
1923 if (!ST->hasNEON() ||
1924 MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) {
1925 // Use generic lowering when custom lowering is not possible.
1926 return Ty.isScalar() && (Size == 32 || Size == 64) &&
1927 Helper.lowerBitCount(MI) ==
1928 LegalizerHelper::LegalizeResult::Legalized;
1929 }
1930
1931 // Pre-conditioning: widen Val up to the nearest vector type.
1932 // s32,s64,v4s16,v2s32 -> v8i8
1933 // v8s16,v4s32,v2s64 -> v16i8
1934 LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
1935 if (Ty.isScalar()) {
1936     assert((Size == 32 || Size == 64 || Size == 128) &&
            "Expected only 32, 64, or 128 bit scalars!");
1937 if (Size == 32) {
1938 Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
1939 }
1940 }
1941 Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);
1942
1943 // Count bits in each byte-sized lane.
1944 auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);
1945
1946 // Sum across lanes.
1947
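  // With the dot-product extension, a UDOT of the per-byte counts against an
  // all-ones vector accumulates each group of four byte counts into a 32-bit
  // lane; the v2s64 case then needs one extra pairwise widening add
  // (G_UADDLP).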
1948 if (ST->hasDotProd() && Ty.isVector() && Ty.getNumElements() >= 2 &&
1949 Ty.getScalarSizeInBits() != 16) {
1950 LLT Dt = Ty == LLT::fixed_vector(2, 64) ? LLT::fixed_vector(4, 32) : Ty;
1951 auto Zeros = MIRBuilder.buildConstant(Dt, 0);
1952 auto Ones = MIRBuilder.buildConstant(VTy, 1);
1953 MachineInstrBuilder Sum;
1954
1955 if (Ty == LLT::fixed_vector(2, 64)) {
1956 auto UDOT =
1957 MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
1958 Sum = MIRBuilder.buildInstr(AArch64::G_UADDLP, {Ty}, {UDOT});
1959 } else if (Ty == LLT::fixed_vector(4, 32)) {
1960 Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
1961 } else if (Ty == LLT::fixed_vector(2, 32)) {
1962 Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
1963 } else {
1964 llvm_unreachable("unexpected vector shape");
1965 }
1966
1967 Sum->getOperand(0).setReg(Dst);
1968 MI.eraseFromParent();
1969 return true;
1970 }
1971
1972 Register HSum = CTPOP.getReg(0);
1973 unsigned Opc;
1974 SmallVector<LLT> HAddTys;
1975 if (Ty.isScalar()) {
1976 Opc = Intrinsic::aarch64_neon_uaddlv;
1977 HAddTys.push_back(LLT::scalar(32));
1978 } else if (Ty == LLT::fixed_vector(8, 16)) {
1979 Opc = Intrinsic::aarch64_neon_uaddlp;
1980 HAddTys.push_back(LLT::fixed_vector(8, 16));
1981 } else if (Ty == LLT::fixed_vector(4, 32)) {
1982 Opc = Intrinsic::aarch64_neon_uaddlp;
1983 HAddTys.push_back(LLT::fixed_vector(8, 16));
1984 HAddTys.push_back(LLT::fixed_vector(4, 32));
1985 } else if (Ty == LLT::fixed_vector(2, 64)) {
1986 Opc = Intrinsic::aarch64_neon_uaddlp;
1987 HAddTys.push_back(LLT::fixed_vector(8, 16));
1988 HAddTys.push_back(LLT::fixed_vector(4, 32));
1989 HAddTys.push_back(LLT::fixed_vector(2, 64));
1990 } else if (Ty == LLT::fixed_vector(4, 16)) {
1991 Opc = Intrinsic::aarch64_neon_uaddlp;
1992 HAddTys.push_back(LLT::fixed_vector(4, 16));
1993 } else if (Ty == LLT::fixed_vector(2, 32)) {
1994 Opc = Intrinsic::aarch64_neon_uaddlp;
1995 HAddTys.push_back(LLT::fixed_vector(4, 16));
1996 HAddTys.push_back(LLT::fixed_vector(2, 32));
1997 } else
1998 llvm_unreachable("unexpected vector shape");
1999 MachineInstrBuilder UADD;
2000 for (LLT HTy : HAddTys) {
2001 UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}).addUse(HSum);
2002 HSum = UADD.getReg(0);
2003 }
2004
2005 // Post-conditioning.
2006 if (Ty.isScalar() && (Size == 64 || Size == 128))
2007 MIRBuilder.buildZExt(Dst, UADD);
2008 else
2009 UADD->getOperand(0).setReg(Dst);
2010 MI.eraseFromParent();
2011 return true;
2012 }
2013
2014 bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
2015 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2016 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2017 LLT s64 = LLT::scalar(64);
2018 auto Addr = MI.getOperand(1).getReg();
2019 auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
2020 auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
2021 auto DstLo = MRI.createGenericVirtualRegister(s64);
2022 auto DstHi = MRI.createGenericVirtualRegister(s64);
2023
2024 MachineInstrBuilder CAS;
2025 if (ST->hasLSE()) {
2026 // We have 128-bit CASP instructions taking XSeqPair registers, which are
2027 // s128. We need the merge/unmerge to bracket the expansion and pair up with
2028 // the rest of the MIR so we must reassemble the extracted registers into a
2029 // 128-bit known-regclass one with code like this:
2030 //
2031 // %in1 = REG_SEQUENCE Lo, Hi ; One for each input
2032 // %out = CASP %in1, ...
2033 // %OldLo = G_EXTRACT %out, 0
2034 // %OldHi = G_EXTRACT %out, 64
2035 auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
2036 unsigned Opcode;
2037 switch (Ordering) {
2038 case AtomicOrdering::Acquire:
2039 Opcode = AArch64::CASPAX;
2040 break;
2041 case AtomicOrdering::Release:
2042 Opcode = AArch64::CASPLX;
2043 break;
2044 case AtomicOrdering::AcquireRelease:
2045 case AtomicOrdering::SequentiallyConsistent:
2046 Opcode = AArch64::CASPALX;
2047 break;
2048 default:
2049 Opcode = AArch64::CASPX;
2050 break;
2051 }
2052
2053 LLT s128 = LLT::scalar(128);
2054 auto CASDst = MRI.createGenericVirtualRegister(s128);
2055 auto CASDesired = MRI.createGenericVirtualRegister(s128);
2056 auto CASNew = MRI.createGenericVirtualRegister(s128);
2057 MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
2058 .addUse(DesiredI->getOperand(0).getReg())
2059 .addImm(AArch64::sube64)
2060 .addUse(DesiredI->getOperand(1).getReg())
2061 .addImm(AArch64::subo64);
2062 MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
2063 .addUse(NewI->getOperand(0).getReg())
2064 .addImm(AArch64::sube64)
2065 .addUse(NewI->getOperand(1).getReg())
2066 .addImm(AArch64::subo64);
2067
2068 CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});
2069
2070 MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
2071 MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
2072 } else {
2073     // The -O0 CMP_SWAP_128 is easier to generate code for because LDXP/STXP
2074     // can take arbitrary registers, so it just uses the normal GPR64 operands
2075     // that the rest of AArch64 expects.
2076 auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
2077 unsigned Opcode;
2078 switch (Ordering) {
2079 case AtomicOrdering::Acquire:
2080 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
2081 break;
2082 case AtomicOrdering::Release:
2083 Opcode = AArch64::CMP_SWAP_128_RELEASE;
2084 break;
2085 case AtomicOrdering::AcquireRelease:
2086 case AtomicOrdering::SequentiallyConsistent:
2087 Opcode = AArch64::CMP_SWAP_128;
2088 break;
2089 default:
2090 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
2091 break;
2092 }
2093
2094 auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2095 CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
2096 {Addr, DesiredI->getOperand(0),
2097 DesiredI->getOperand(1), NewI->getOperand(0),
2098 NewI->getOperand(1)});
2099 }
2100
2101 CAS.cloneMemRefs(MI);
2102 constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
2103 *MRI.getTargetRegisterInfo(),
2104 *ST->getRegBankInfo());
2105
2106 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi});
2107 MI.eraseFromParent();
2108 return true;
2109 }
2110
2111 bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
2112 LegalizerHelper &Helper) const {
2113 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2114 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
2115 LLT Ty = MRI.getType(MI.getOperand(1).getReg());
2116 auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
2117 MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
2118 MI.eraseFromParent();
2119 return true;
2120 }
2121
2122 bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
2123 LegalizerHelper &Helper) const {
2124 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2125
2126   // The tagged version (MOPSMemorySetTagged) is legalised in legalizeIntrinsic.
2127 if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
2128 // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
2129 // the instruction).
2130 auto &Value = MI.getOperand(1);
2131 Register ExtValueReg =
2132 MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
2133 Value.setReg(ExtValueReg);
2134 return true;
2135 }
2136
2137 return false;
2138 }
2139
2140 bool AArch64LegalizerInfo::legalizeExtractVectorElt(
2141 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2142 assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT);
2143 auto VRegAndVal =
2144 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2145 if (VRegAndVal)
2146 return true;
2147 return Helper.lowerExtractInsertVectorElt(MI) !=
2148 LegalizerHelper::LegalizeResult::UnableToLegalize;
2149 }
2150
2151 bool AArch64LegalizerInfo::legalizeDynStackAlloc(
2152 MachineInstr &MI, LegalizerHelper &Helper) const {
2153 MachineFunction &MF = *MI.getParent()->getParent();
2154 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2155 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
2156
2157 // If stack probing is not enabled for this function, use the default
2158 // lowering.
2159 if (!MF.getFunction().hasFnAttribute("probe-stack") ||
2160 MF.getFunction().getFnAttribute("probe-stack").getValueAsString() !=
2161 "inline-asm") {
2162 Helper.lowerDynStackAlloc(MI);
2163 return true;
2164 }
2165
2166 Register Dst = MI.getOperand(0).getReg();
2167 Register AllocSize = MI.getOperand(1).getReg();
2168 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
2169
2170 assert(MRI.getType(Dst) == LLT::pointer(0, 64) &&
2171 "Unexpected type for dynamic alloca");
2172 assert(MRI.getType(AllocSize) == LLT::scalar(64) &&
2173 "Unexpected type for dynamic alloca");
2174
2175 LLT PtrTy = MRI.getType(Dst);
2176 Register SPReg =
2177 Helper.getTargetLowering().getStackPointerRegisterToSaveRestore();
2178 Register SPTmp =
2179 Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
2180 auto NewMI =
2181 MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp});
2182 MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass);
2183 MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI);
2184 MIRBuilder.buildCopy(Dst, SPTmp);
2185
2186 MI.eraseFromParent();
2187 return true;
2188 }
2189
2190 bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
2191 LegalizerHelper &Helper) const {
2192 MachineIRBuilder &MIB = Helper.MIRBuilder;
2193 auto &AddrVal = MI.getOperand(0);
2194
2195 int64_t IsWrite = MI.getOperand(1).getImm();
2196 int64_t Locality = MI.getOperand(2).getImm();
2197 int64_t IsData = MI.getOperand(3).getImm();
2198
2199 bool IsStream = Locality == 0;
2200 if (Locality != 0) {
2201 assert(Locality <= 3 && "Prefetch locality out-of-range");
2202     // The IR locality degree runs in the opposite direction to the target
2203     // cache level, so invert the number; the encoding starts at 0 for the
2204     // L1 cache.
2205 Locality = 3 - Locality;
2206 }
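  // For example, locality 3 (keep in cache as long as possible) becomes
  // target level 0 (L1) with the KEEP policy, while locality 0 selects the
  // streaming (STRM) variant.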
2207
2208 unsigned PrfOp = (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;
2209
2210 MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
2211 MI.eraseFromParent();
2212 return true;
2213 }
2214